def test_seed_redirect(httpd):
    '''
    Crawl a seed that redirects and check that the site's surt scope is
    expanded to cover the redirect destination.
    '''
    # bug fix: test_id previously read 'test_login-%s', which mislabeled
    # this test's rows in the captures table
    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    pages.sort(key=lambda page: page.hops_from_seed)
    assert pages[0].hops_from_seed == 0
    assert pages[0].url == seed_url
    assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
def test_hashtags(httpd):
    '''
    Crawl site7 and verify that link hashtags are recorded on the page and
    that the hashtagged resources end up in the captures table.
    '''
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(
        rr, {
            'seed': seed_url,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we the page we expected
    pages = sorted(frontier.site_pages(site.id), key=lambda p: p.url)
    assert len(pages) == 2
    seed_page, foo_page = pages
    assert seed_page.url == seed_url
    assert seed_page.hops_from_seed == 0
    assert seed_page.brozzle_count == 1
    assert seed_page.outlinks['accepted'] == [
        'http://localhost:%s/site7/foo.html' % httpd.server_port
    ]
    assert not seed_page.hashtags
    assert foo_page.url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert foo_page.hops_from_seed == 1
    assert foo_page.brozzle_count == 1
    assert sorted(foo_page.hashtags) == [
        '#boosh',
        '#ignored',
        '#whee',
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    captures_by_url = {
        c['url']: c
        for c in captures if c['http_method'] != 'HEAD'
    }
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
def test_hashtag_seed():
    '''
    A hashtag on the seed url should be stripped from the page url and
    recorded in page.hashtags (surt-scope version).
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # seed without a hashtag
    plain_site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, plain_site)
    assert plain_site.scope['surt'] == 'http://(org,example,)/'
    plain_pages = list(frontier.site_pages(plain_site.id))
    assert len(plain_pages) == 1
    assert plain_pages[0].url == 'http://example.org/'
    assert not plain_pages[0].hashtags

    # seed with a hashtag
    tagged_site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, tagged_site)
    assert tagged_site.scope['surt'] == 'http://(org,example,)/'
    tagged_pages = list(frontier.site_pages(tagged_site.id))
    assert len(tagged_pages) == 1
    assert tagged_pages[0].url == 'http://example.org/'
    assert tagged_pages[0].hashtags == ['#hash',]
def test_hashtag_seed():
    '''
    A hashtag on the seed url should be stripped from the page url and
    recorded in page.hashtags (ssurt-scope version).
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # seed without a hashtag
    bare_site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, bare_site)
    assert bare_site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]}
    bare_pages = list(frontier.site_pages(bare_site.id))
    assert len(bare_pages) == 1
    assert bare_pages[0].url == 'http://example.org/'
    assert not bare_pages[0].hashtags

    # seed with a hashtag
    hashy_site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, hashy_site)
    assert hashy_site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]}
    hashy_pages = list(frontier.site_pages(hashy_site.id))
    assert len(hashy_pages) == 1
    assert hashy_pages[0].url == 'http://example.org/'
    assert hashy_pages[0].hashtags == ['#hash',]
def test_seed_redirect(httpd):
    '''
    Crawl a seed that redirects and check that the site's ssurt scope gains
    an accept rule for the redirect destination.
    '''
    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]}

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = sorted(frontier.site_pages(site.id),
                   key=lambda page: page.hops_from_seed)
    assert len(pages) == 2
    first, second = pages
    assert first.hops_from_seed == 0
    assert first.url == seed_url
    assert first.redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert second.hops_from_seed == 1
    assert second.url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly
    assert site.scope == {'accepts': [
        {'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port},
        {'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]}
def test_obey_robots(httpd):
    '''
    With a user agent blocked by robots.txt, only robots.txt itself should be
    fetched, and the seed page should be marked blocked_by_robots.
    '''
    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')

    site = brozzler.Site(
        rr, {
            'seed': make_url(httpd, '/site1/'),
            'user_agent': 'im a badbot',  # robots.txt blocks badbot
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        initial_pages = list(frontier.site_pages(site.id))
        assert len(initial_pages) == 1
        assert initial_pages[0].url == site.seed
        assert initial_pages[0].needs_robots_check
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that only the one page is in rethinkdb
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert page.url == make_url(httpd, '/site1/')
    assert page.blocked_by_robots

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = make_url(httpd, '/robots.txt')
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    assert len(captures) == 1
    assert captures[0]['url'] == robots_url

    # check pywb
    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
    expected_payload = open(
        os.path.join(os.path.dirname(__file__), 'htdocs', 'robots.txt'),
        'rb').read()
    assert requests.get(wb_url, allow_redirects=False).content == expected_payload
def test_brozzle_site(httpd):
    '''
    End-to-end crawl of the test httpd root: expect the seed page, robots.txt
    and file1.txt to be brozzled, captured by warcprox, and replayable in pywb.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
        seed='http://localhost:%s/' % httpd.server_port,
        proxy='localhost:8000', enable_warcprox_features=True,
        warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        r = rethinkstuff.Rethinker('localhost', db='brozzler')
        frontier = brozzler.RethinkDbFrontier(r)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        assert len(list(frontier.site_pages(site.id))) == 1
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 3
    assert {page.url for page in pages} == {
        'http://localhost:%s/' % httpd.server_port,
        'http://localhost:%s/robots.txt' % httpd.server_port,
        'http://localhost:%s/file1.txt' % httpd.server_port}

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload
def brozzler_new_site(argv=None):
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzler-new-site - register site to brozzle',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
        '--time-limit', dest='time_limit', default=None,
        help='time limit in seconds for this site')
    arg_parser.add_argument(
        '--ignore-robots', dest='ignore_robots', action='store_true',
        help='ignore robots.txt for this site')
    arg_parser.add_argument(
        '--warcprox-meta', dest='warcprox_meta',
        help=(
            'Warcprox-Meta http request header to send with each request; '
            'must be a json blob, ignored unless warcprox features are '
            'enabled'))
    arg_parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters', default=None,
        help=(
            'json blob of parameters to populate the javascript behavior '
            'template, e.g. {"parameter_username":"******",'
            '"parameter_password":"******"}'))
    arg_parser.add_argument(
        '--username', dest='username', default=None,
        help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
        '--password', dest='password', default=None,
        help='use this password to try to log in if a login form is found')
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    # json-valued options are parsed lazily so that omitting them yields None
    site = brozzler.Site(rr, {
        'seed': args.seed,
        'time_limit': int(args.time_limit) if args.time_limit else None,
        'ignore_robots': args.ignore_robots,
        'warcprox_meta': json.loads(
            args.warcprox_meta) if args.warcprox_meta else None,
        'behavior_parameters': json.loads(
            args.behavior_parameters) if args.behavior_parameters else None,
        'username': args.username,
        'password': args.password})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
def test_login(httpd):
    '''
    Crawl site2 with credentials configured and verify that brozzler's login
    heuristic submits the one matching form (action='00') and captures the
    expected requests.
    '''
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(
        rr, {
            'seed': 'http://localhost:%s/site2/' % httpd.server_port,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            },
            'username': '******',
            'password': '******'
        })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(
        rr.table('captures').filter({
            'test_id': test_id
        }).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
    assert (
        'WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html'
        % httpd.server_port) in meth_url
    assert (
        'WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html'
        % httpd.server_port) in meth_url
def test_time_limit():
    '''
    Exercise starts_and_stops bookkeeping and enforce_time_limit behavior.
    '''
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'time_limit': 99999
    })
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # no time limit set
    frontier.enforce_time_limit(site)

    site.time_limit = 10
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    time.sleep(0.1)

    with pytest.raises(brozzler.ReachedTimeLimit):
        frontier.enforce_time_limit(site)
def test_redirect_hashtags(httpd):
    '''
    Crawl site9 where a link with hashtags redirects; the redirect target
    should record the hashtags, and the redirect url should be captured
    exactly twice (youtube-dl + browser), with no hashtag variants.
    '''
    # bug fix: test_id previously read 'test_hashtags-%s', colliding with
    # test_hashtags' rows in the captures table
    test_id = 'test_redirect_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = make_url(httpd, '/site9/')
    site = brozzler.Site(
        rr, {
            'seed': seed_url,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we the page we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        make_url(httpd, '/site9/redirect.html')
    ]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, '/site9/redirect.html')
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == [
        '#hash1',
        '#hash2',
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    redirect_captures = [
        c for c in captures
        if c['url'] == make_url(httpd, '/site9/redirect.html')
        and c['http_method'] == 'GET'
    ]
    assert len(redirect_captures) == 2  # youtube-dl + browser, no hashtags
def test_time_limit():
    '''
    Exercise starts_and_stops bookkeeping and _enforce_time_limit, including
    the FINISHED_TIME_LIMIT transition once the limit is exceeded.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'time_limit': 99999
    })
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    first_interval = site.starts_and_stops[0]
    assert first_interval['start']
    assert first_interval['stop'] is None

    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    first_interval = site.starts_and_stops[0]
    assert first_interval['start']
    assert first_interval['stop']
    assert first_interval['stop'] > first_interval['start']

    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # time limit not reached yet
    frontier._enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    site.claimed = True
    site.save()

    time.sleep(0.1)
    frontier._enforce_time_limit(site)
    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
def test_obey_robots(httpd):
    '''
    With a user agent blocked by robots.txt, only robots.txt itself should be
    captured, and the one site page should remain the seed (rethinkstuff era).
    '''
    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
        seed='http://localhost:%s/' % httpd.server_port,
        proxy='localhost:8000', enable_warcprox_features=True,
        user_agent='im a badbot',  # robots.txt blocks badbot
        warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        r = rethinkstuff.Rethinker('localhost', db='brozzler')
        frontier = brozzler.RethinkDbFrontier(r)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        queued_pages = list(frontier.site_pages(site.id))
        assert len(queued_pages) == 1
        assert queued_pages[0].url == site.seed
        assert queued_pages[0].needs_robots_check
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert {page.url for page in pages} == {
        'http://localhost:%s/' % httpd.server_port}

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(r.table('captures').filter({'test_id':test_id}).run())
    assert len(captures) == 1
    assert captures[0]['url'] == robots_url

    # check pywb
    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
    assert requests.get(
        wb_url, allow_redirects=False).content == expected_payload
def test_time_limit():
    '''
    Exercise starts_and_stops bookkeeping and enforce_time_limit, expecting
    ReachedTimeLimit once the limit is exceeded.
    '''
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/', 'time_limit':99999})
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    interval = site.starts_and_stops[0]
    assert interval['start']
    assert interval['stop'] is None

    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    interval = site.starts_and_stops[0]
    assert interval['start']
    assert interval['stop']
    assert interval['stop'] > interval['start']

    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # no time limit set
    frontier.enforce_time_limit(site)

    site.time_limit = 10
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    time.sleep(0.1)

    with pytest.raises(brozzler.ReachedTimeLimit):
        frontier.enforce_time_limit(site)
def brozzler_new_site():
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='brozzler-new-site - register site to brozzle',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    _add_rethinkdb_options(arg_parser)
    _add_proxy_options(arg_parser)
    arg_parser.add_argument(
        '--time-limit', dest='time_limit', default=None,
        help='time limit in seconds for this site')
    arg_parser.add_argument(
        '--ignore-robots', dest='ignore_robots', action='store_true',
        help='ignore robots.txt for this site')
    arg_parser.add_argument(
        '--warcprox-meta', dest='warcprox_meta',
        help=(
            'Warcprox-Meta http request header to send with each request; '
            'must be a json blob, ignored unless warcprox features are '
            'enabled'))
    arg_parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters', default=None,
        help=(
            'json blob of parameters to populate the javascript behavior '
            'template, e.g. {"parameter_username":"******",'
            '"parameter_password":"******"}'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # json-valued options are parsed only if supplied
    site = brozzler.Site(
        seed=args.seed, proxy=args.proxy,
        time_limit=int(args.time_limit) if args.time_limit else None,
        ignore_robots=args.ignore_robots,
        enable_warcprox_features=args.enable_warcprox_features,
        warcprox_meta=json.loads(
            args.warcprox_meta) if args.warcprox_meta else None,
        behavior_parameters=json.loads(
            args.behavior_parameters) if args.behavior_parameters else None)

    r = rethinkstuff.Rethinker(
        args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
def test_ydl_stitching(httpd):
    '''
    Crawl site10 and verify youtube-dl stitches segmented video: six videos
    recorded on the page, including the stitched WARCPROX_WRITE_RECORD one.
    '''
    test_id = 'test_ydl_stitching-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(
        rr, {
            'seed': make_url(httpd, '/site10/'),
            'warcprox_meta': {
                'warc-prefix': 'test_ydl_stitching',
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    # videos may land a little after the crawl finishes; keep polling
    while time.time() - start < 600 and not page.videos:
        time.sleep(0.5)
        page.refresh()
    assert len(page.videos) == 6
    stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
    assert {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
        'url': stitched_url,
    } in page.videos

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    stitched_captures = [c for c in captures if c['url'] == stitched_url]
    assert len(stitched_captures) == 1
    stitched = stitched_captures[0]
    assert stitched['filename'].startswith('test_ydl_stitching')
    assert stitched['content_type'] == 'video/mp4'
    assert stitched['http_method'] == 'WARCPROX_WRITE_RECORD'
def test_brozzle_site(httpd):
    '''
    End-to-end crawl of the test httpd root (rethinkstuff era): expect the
    seed page and file1.txt to be brozzled, captured, and replayable in pywb.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
        seed='http://localhost:%s/' % httpd.server_port,
        proxy='localhost:8000', enable_warcprox_features=True,
        warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        'http://localhost:%s/' % httpd.server_port,
        'http://localhost:%s/file1.txt' % httpd.server_port
    }

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']:c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload
def test_hashtags(httpd):
    '''
    Crawl site7 and verify hashtags are recorded on the linked page and the
    hashtagged resources show up in the captures table (compact-style era).
    '''
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we the page we expected
    pages = sorted(frontier.site_pages(site.id), key=lambda p: p.url)
    assert len(pages) == 2
    seed_page, foo_page = pages
    assert seed_page.url == seed_url
    assert seed_page.hops_from_seed == 0
    assert seed_page.brozzle_count == 1
    assert seed_page.outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
    assert not seed_page.hashtags
    assert foo_page.url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert foo_page.hops_from_seed == 1
    assert foo_page.brozzle_count == 1
    assert sorted(foo_page.hashtags) == ['#boosh','#ignored','#whee',]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
def brozzler_new_site():
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='brozzler-new-site - register site to brozzle',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    _add_rethinkdb_options(arg_parser)
    _add_proxy_options(arg_parser)
    arg_parser.add_argument(
        '--time-limit', dest='time_limit', default=None,
        help='time limit in seconds for this site')
    arg_parser.add_argument(
        '--ignore-robots', dest='ignore_robots', action='store_true',
        help='ignore robots.txt for this site')
    arg_parser.add_argument(
        '--warcprox-meta', dest='warcprox_meta',
        help=(
            'Warcprox-Meta http request header to send with each request; '
            'must be a json blob, ignored unless warcprox features are '
            'enabled'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # warcprox_meta is parsed from json only when supplied
    site = brozzler.Site(
        seed=args.seed, proxy=args.proxy,
        time_limit=int(args.time_limit) if args.time_limit else None,
        ignore_robots=args.ignore_robots,
        enable_warcprox_features=args.enable_warcprox_features,
        warcprox_meta=(
            json.loads(args.warcprox_meta) if args.warcprox_meta else None))

    r = rethinkstuff.Rethinker(
        args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
def test_claim_site():
    '''
    Verify claim_sites claims a new site once, refuses to reclaim while the
    claim is fresh (< 1 hour), and reclaims a stale claim (> 1 hour).
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    rr.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    claimed = frontier.claim_sites()
    assert len(claimed) == 1
    claimed_site = claimed[0]
    assert claimed_site.id == site.id
    assert claimed_site.claimed
    assert claimed_site.last_claimed >= doublethink.utcnow(
    ) - datetime.timedelta(minutes=1)
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(
        minutes=55)
    claimed_site.save()
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed more than 1 hour ago can be reclaimed
    site = claimed_site
    claimed_site = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    claimed = frontier.claim_sites()
    assert len(claimed) == 1
    claimed_site = claimed[0]
    assert claimed_site.id == site.id

    # clean up
    rr.table('sites').get(claimed_site.id).delete().run()
def test_ydl_stitching(httpd):
    '''
    Crawl site10 and verify that the youtube-dl stitched video shows up in
    page.videos and in the warcprox captures table.
    '''
    test_id = 'test_ydl_stitching-%s' % datetime.datetime.utcnow().isoformat()
    rethinker = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rethinker)
    site = brozzler.Site(rethinker, {
        'seed': 'http://localhost:%s/site10/' % httpd.server_port,
        'warcprox_meta': {
            'warc-prefix': 'test_ydl_stitching',
            'captures-table-extra-fields': {'test_id': test_id}}})
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly (five minute timeout)
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert len(page.videos) == 6
    stitched_url = 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port
    expected_video = {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
        'url': stitched_url,
    }
    assert expected_video in page.videos

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = list(
        rethinker.table('captures').filter({'test_id': test_id}).run())
    matches = [c for c in captures if c['url'] == stitched_url]
    assert len(matches) == 1
    capture = matches[0]
    assert capture['filename'].startswith('test_ydl_stitching')
    assert capture['content_type'] == 'video/mp4'
    assert capture['http_method'] == 'WARCPROX_WRITE_RECORD'
def test_claim_site():
    '''
    Check frontier.claim_sites() behavior: raises NothingToClaim on an
    empty/fully-claimed frontier, and honors the one-hour reclaim window.
    '''
    db = doublethink.Rethinker('localhost', db='ignoreme')
    front = brozzler.RethinkDbFrontier(db)
    db.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        winner = front.claim_sites()

    site = brozzler.Site(db, {'seed': 'http://example.org/'})
    brozzler.new_site(front, site)

    winners = front.claim_sites()
    assert len(winners) == 1
    winner = winners[0]
    assert winner.id == site.id
    assert winner.claimed
    assert winner.last_claimed >= (
        doublethink.utcnow() - datetime.timedelta(minutes=1))

    # a claimed site may not be claimed again right away
    with pytest.raises(brozzler.NothingToClaim):
        winner = front.claim_sites()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    winner.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=55)
    winner.save()
    with pytest.raises(brozzler.NothingToClaim):
        winner = front.claim_sites()

    # site last_claimed more than 1 hour ago can be reclaimed
    site = winner
    winner = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    winners = front.claim_sites()
    assert len(winners) == 1
    winner = winners[0]
    assert winner.id == site.id

    # clean up
    db.table('sites').get(winner.id).delete().run()
def test_hashtag_links():
    '''
    Outlinks that differ only by fragment collapse onto one page; the
    fragments are recorded in that page's hashtags list.
    '''
    rethinker = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rethinker)
    site = brozzler.Site(rethinker, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    parent = frontier.seed_page(site.id)
    assert not parent.hashtags

    frontier.scope_and_schedule_outlinks(site, parent, [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ])

    pages = sorted(frontier.site_pages(site.id), key=lambda pg: pg.url)
    assert len(pages) == 3
    seed, bar, zuh = pages

    assert seed.url == 'http://example.org/'
    assert sorted(seed.outlinks['accepted']) == [
        'http://example.org/', 'http://example.org/bar',
        'http://example.org/zuh']
    assert not seed.outlinks['blocked']
    assert not seed.outlinks['rejected']
    assert seed.hashtags == ['#foo',]
    assert seed.hops_from_seed == 0

    assert bar.url == 'http://example.org/bar'
    assert sorted(bar.hashtags) == ['#baz', '#quux']
    assert bar.priority == 36
    assert bar.hops_from_seed == 1

    assert zuh.url == 'http://example.org/zuh'
    assert zuh.hashtags == ['#buh']
    assert zuh.priority == 12
def test_login(httpd):
    '''
    Crawl site2 with credentials and verify brozzler's login-form heuristic
    submitted exactly the expected form, alongside the normal crawl requests.
    '''
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rethinker = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rethinker, {
        'seed': 'http://localhost:%s/site2/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields': {'test_id': test_id}},
        'username': '******',
        'password': '******'})
    frontier = brozzler.RethinkDbFrontier(rethinker)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly (five minute timeout)
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)   # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(rethinker.table('captures').filter(
        {'test_id': test_id}).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    for template in (
            'GET http://localhost:%s/robots.txt',
            'GET http://localhost:%s/site2/',
            'WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/',
            'WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/',
            'GET http://localhost:%s/site2/login.html',
            'WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html',
            'WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html'):
        assert (template % httpd.server_port) in meth_url
def test_hashtag_links():
    '''
    Scheduling outlinks with fragments: same-url links merge into a single
    page whose hashtags collect the fragments; priorities reflect scoping.
    '''
    db = doublethink.Rethinker('localhost', db='ignoreme')
    front = brozzler.RethinkDbFrontier(db)
    site = brozzler.Site(db, {'seed': 'http://example.org/'})
    brozzler.new_site(front, site)
    parent_page = front.seed_page(site.id)
    assert not parent_page.hashtags

    links = [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ]
    front.scope_and_schedule_outlinks(site, parent_page, links)

    pages = sorted(front.site_pages(site.id), key=lambda pg: pg.url)
    assert len(pages) == 3

    # page 0: the seed, with '#foo' folded in as a hashtag
    assert pages[0].url == 'http://example.org/'
    assert sorted(pages[0].outlinks['accepted']) == [
        'http://example.org/',
        'http://example.org/bar',
        'http://example.org/zuh']
    assert not pages[0].outlinks['blocked']
    assert not pages[0].outlinks['rejected']
    assert pages[0].hashtags == ['#foo',]
    assert pages[0].hops_from_seed == 0

    # page 1: /bar accumulated both of its fragments
    assert pages[1].url == 'http://example.org/bar'
    assert sorted(pages[1].hashtags) == ['#baz', '#quux']
    assert pages[1].priority == 36
    assert pages[1].hops_from_seed == 1

    # page 2: /zuh with its single fragment
    assert pages[2].url == 'http://example.org/zuh'
    assert pages[2].hashtags == ['#buh']
    assert pages[2].priority == 12
def test_warcprox_outage_resiliency(httpd):
    '''
    Tests resiliency to warcprox outage.

    If no instances of warcprox are healthy when starting to crawl a site,
    brozzler-worker should sit there and wait until a healthy instance
    appears.

    If an instance goes down, sites assigned to that instance should bounce
    over to a healthy instance.

    If all instances of warcprox go down, brozzler-worker should sit and
    wait.
    '''
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox; port 0 lets the OS pick free ports
    opts = warcprox.Options()
    opts.address = '0.0.0.0'
    opts.port = 0
    warcprox1 = warcprox.controller.WarcproxController(
        service_registry=svcreg, options=opts)
    warcprox2 = warcprox.controller.WarcproxController(
        service_registry=svcreg, options=opts)
    warcprox1_thread = threading.Thread(
        target=warcprox1.run_until_shutdown, name='warcprox1')
    warcprox2_thread = threading.Thread(
        target=warcprox2.run_until_shutdown, name='warcprox2')

    # put together a site to crawl; /infinite/ keeps generating outlinks so
    # the crawl never finishes on its own
    test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    try:
        # we manage warcprox instances ourselves, so stop the one running on
        # the system, if any
        try:
            stop_service('warcprox')
        except Exception as e:
            logging.warn('problem stopping warcprox service: %s', e)

        # queue the site for brozzling
        brozzler.new_site(frontier, site)

        # check that nothing happens
        # XXX tail brozzler-worker.log or something?
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == 1

        # start one instance of warcprox
        warcprox1_thread.start()

        # check that it started using that instance (poll up to 30s)
        start = time.time()
        while not site.proxy and time.time() - start < 30:
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox1.proxy.server_port)

        # check that the site accumulates pages in the frontier, confirming
        # that crawling is really happening
        start = time.time()
        while (len(list(frontier.site_pages(site.id))) <= 1
               and time.time() - start < 60):
            time.sleep(0.5)
            site.refresh()
        assert len(list(frontier.site_pages(site.id))) > 1

        # stop warcprox #1, start warcprox #2
        warcprox2_thread.start()
        warcprox1.stop.set()
        warcprox1_thread.join()

        # check that it switched over to warcprox #2
        start = time.time()
        while ((not site.proxy
                or not site.proxy.endswith(
                    ':%s' % warcprox2.proxy.server_port))
               and time.time() - start < 30):
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox2.proxy.server_port)

        # stop warcprox #2
        warcprox2.stop.set()
        warcprox2_thread.join()

        page_count = len(list(frontier.site_pages(site.id)))
        assert page_count > 1

        # check that it is waiting for a warcprox to appear: no proxy
        # assigned and no new pages discovered while none is running
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == page_count

        # stop crawling the site, else it can pollute subsequent test runs
        brozzler.cli.brozzler_stop_crawl([
            'brozzler-stop-crawl', '--site=%s' % site.id])
        site.refresh()
        assert site.stop_requested

        # stop request should be honored quickly
        start = time.time()
        while not site.status.startswith(
                'FINISHED') and time.time() - start < 120:
            time.sleep(0.5)
            site.refresh()
        assert site.status == 'FINISHED_STOP_REQUESTED'
    finally:
        # shut down both warcprox instances and restore the system service
        warcprox1.stop.set()
        warcprox2.stop.set()
        warcprox1_thread.join()
        warcprox2_thread.join()
        start_service('warcprox')
def _test_proxy_setting(
        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    '''
    Shared helper: crawl site1 with the given proxy configuration and
    assert on the resulting site proxy setting, pages, and (when warcprox
    is in play) the captures table and pywb playback.
    '''
    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
        proxy, warcprox_auto, is_warcprox,
        datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    rr = doublethink.Rethinker('localhost', db='brozzler')
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # brozzle the site in-process with a single browser rather than relying
    # on the brozzler-worker service, so the proxy args take effect
    worker = brozzler.worker.BrozzlerWorker(
        frontier, service_registry, max_browsers=1,
        chrome_exe=brozzler.suggest_default_chrome_exe(),
        warcprox_auto=warcprox_auto, proxy=proxy)
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy
    site.refresh()  # check that these things were persisted
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        'http://localhost:%s/site1/' % httpd.server_port,
        'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table (HEAD requests are uninteresting)
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert 'screenshot:%s' % page1 in captures_by_url
        assert 'thumbnail:%s' % page1 in captures_by_url

        # check pywb playback of page2 matches the file on disk
        t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
        wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
        expected_payload = open(os.path.join(
            os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'),
            'rb').read()
        assert requests.get(wb_url).content == expected_payload
    else:
        # without warcprox in the loop, nothing should have been captured
        assert captures_by_url == {}
def test_brozzle_site(httpd):
    '''
    End-to-end crawl of site1 via the brozzler-worker service: verifies the
    discovered pages, the warcprox captures table, and pywb playback of the
    page payload, screenshot, and thumbnail.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        # queuing a new site creates exactly one (seed) page
        assert len(list(frontier.site_pages(site.id))) == 1
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly (five minute timeout)
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        'http://localhost:%s/site1/' % httpd.server_port,
        'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table (HEAD requests are uninteresting)
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert robots in captures_by_url
    assert page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb: page2 playback should match the file on disk
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'),
        'rb').read()
    assert requests.get(wb_url).content == expected_payload

    # screenshot record plays back as a jpeg
    url = 'screenshot:%s' % page1
    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers['content-type'] == 'image/jpeg'

    # thumbnail record plays back as a jpeg
    url = 'thumbnail:%s' % page1
    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers['content-type'] == 'image/jpeg'
def test_max_hops_off():
    '''
    Exercise the scope 'max_hops_off' setting (renamed from
    'max_hops_off_surt'): out-of-scope links are still scheduled up to that
    many hops away from in-scope pages, while blocks still apply.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'scope': {
            'max_hops_off_surt': 1,
            'blocks': [{'ssurt': 'domain,bad,'}]}})
    brozzler.new_site(frontier, site)
    site.refresh()  # get it back from the db

    # renamed this param
    assert not 'max_hops_off_surt' in site.scope
    assert site.scope['max_hops_off'] == 1

    seed_page = frontier.seed_page(site.id)

    # neither: out of scope but potentially crawlable via max_hops_off;
    # True: in scope; False: blocked
    assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None
    assert site.accept_reject_or_neither(
        'https://example.com/toot', seed_page) is None
    assert site.accept_reject_or_neither(
        'http://example.com/toot', seed_page) is True
    assert site.accept_reject_or_neither(
        'https://some.bad.domain/something', seed_page) is False

    # bypass robots.txt checking for the duration of the scheduling call
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        # two of these are in scope because of max_hops_off
        frontier.scope_and_schedule_outlinks(site, seed_page, [
            'http://foo.org/', 'https://example.com/toot',
            'http://example.com/toot',
            'https://some.bad.domain/something'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)

    assert len(pages) == 4
    assert pages[0].url == 'http://example.com/'
    assert pages[0].hops_off == 0
    assert not 'hops_off_surt' in pages[0]
    assert set(pages[0].outlinks['accepted']) == {
        'https://example.com/toot', 'http://foo.org/',
        'http://example.com/toot'}
    assert pages[0].outlinks['blocked'] == []
    assert pages[0].outlinks['rejected'] == [
        'https://some.bad.domain/something']
    # in-scope link: hops_off stays 0
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://example.com/toot',
        'via_page_id': seed_page.id
    } in pages
    # out-of-scope links accepted with hops_off incremented to 1
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id
    } in pages
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'https://example.com/toot',
        'via_page_id': seed_page.id
    } in pages

    # next hop is past max_hops_off, but normal in scope url is in scope
    foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, foo_page, [
            'http://foo.org/bar', 'http://example.com/blah'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    # foo_page itself unchanged apart from the recorded outlinks
    assert foo_page == {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id,
        'outlinks': {
            'accepted': ['http://example.com/blah'],
            'blocked': [],
            'rejected': ['http://foo.org/bar'],
        }
    }
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 5
    # back in scope via foo.org: hops_off resets to 0
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 2,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 11,
        'site_id': site.id,
        'url': 'http://example.com/blah',
        'via_page_id': foo_page.id
    } in pages
def test_warcprox_outage_resiliency(httpd):
    '''
    Tests resiliency to warcprox outage.

    If no instances of warcprox are healthy when starting to crawl a site,
    brozzler-worker should sit there and wait until a healthy instance
    appears.

    If an instance goes down, sites assigned to that instance should bounce
    over to a healthy instance.

    If all instances of warcprox go down, brozzler-worker should sit and
    wait.
    '''
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox; port 0 lets the OS pick free ports,
    # and they register themselves in the rethinkdb services table
    opts = warcprox.Options()
    opts.address = '0.0.0.0'
    opts.port = 0
    opts.rethinkdb_services_url = 'rethinkdb://localhost/brozzler/services'
    warcprox1 = warcprox.controller.WarcproxController(opts)
    warcprox2 = warcprox.controller.WarcproxController(opts)
    warcprox1_thread = threading.Thread(
        target=warcprox1.run_until_shutdown, name='warcprox1')
    warcprox2_thread = threading.Thread(
        target=warcprox2.run_until_shutdown, name='warcprox2')

    # put together a site to crawl; /infinite/ keeps generating outlinks so
    # the crawl never finishes on its own
    test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    try:
        # we manage warcprox instances ourselves, so stop the one running on
        # the system, if any
        try:
            stop_service('warcprox')
        except Exception as e:
            logging.warn('problem stopping warcprox service: %s', e)

        # queue the site for brozzling
        brozzler.new_site(frontier, site)

        # check that nothing happens
        # XXX tail brozzler-worker.log or something?
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == 1

        # start one instance of warcprox
        warcprox1_thread.start()

        # check that it started using that instance (poll up to 30s)
        start = time.time()
        while not site.proxy and time.time() - start < 30:
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox1.proxy.server_port)

        # check that the site accumulates pages in the frontier, confirming
        # that crawling is really happening
        start = time.time()
        while (len(list(frontier.site_pages(site.id))) <= 1
               and time.time() - start < 60):
            time.sleep(0.5)
            site.refresh()
        assert len(list(frontier.site_pages(site.id))) > 1

        # stop warcprox #1, start warcprox #2
        warcprox2_thread.start()
        warcprox1.stop.set()
        warcprox1_thread.join()

        # check that it switched over to warcprox #2
        start = time.time()
        while ((not site.proxy
                or not site.proxy.endswith(
                    ':%s' % warcprox2.proxy.server_port))
               and time.time() - start < 30):
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox2.proxy.server_port)

        # stop warcprox #2
        warcprox2.stop.set()
        warcprox2_thread.join()

        page_count = len(list(frontier.site_pages(site.id)))
        assert page_count > 1

        # check that it is waiting for a warcprox to appear: no proxy
        # assigned and no new pages discovered while none is running
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == page_count

        # stop crawling the site, else it can pollute subsequent test runs
        brozzler.cli.brozzler_stop_crawl([
            'brozzler-stop-crawl', '--site=%s' % site.id])
        site.refresh()
        assert site.stop_requested

        # stop request should be honored quickly
        start = time.time()
        while not site.status.startswith(
                'FINISHED') and time.time() - start < 120:
            time.sleep(0.5)
            site.refresh()
        assert site.status == 'FINISHED_STOP_REQUESTED'
    finally:
        # shut down both warcprox instances and restore the system service
        warcprox1.stop.set()
        warcprox2.stop.set()
        warcprox1_thread.join()
        warcprox2_thread.join()
        start_service('warcprox')
def test_max_hops_off():
    '''
    Exercise the scope 'max_hops_off' setting (renamed from
    'max_hops_off_surt'): out-of-scope links are still scheduled up to that
    many hops away from in-scope pages, while blocks still apply.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(
        rr, {
            'seed': 'http://example.com/',
            'scope': {
                'max_hops_off_surt': 1,
                'blocks': [{'ssurt': 'domain,bad,'}]}})
    brozzler.new_site(frontier, site)
    site.refresh()  # get it back from the db

    # renamed this param
    assert not 'max_hops_off_surt' in site.scope
    assert site.scope['max_hops_off'] == 1

    seed_page = frontier.seed_page(site.id)

    # neither: out of scope but potentially crawlable via max_hops_off;
    # True: in scope; False: blocked
    assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None
    assert site.accept_reject_or_neither(
        'https://example.com/toot', seed_page) is None
    assert site.accept_reject_or_neither(
        'http://example.com/toot', seed_page) is True
    assert site.accept_reject_or_neither(
        'https://some.bad.domain/something', seed_page) is False

    # bypass robots.txt checking for the duration of the scheduling call
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        # two of these are in scope because of max_hops_off
        frontier.scope_and_schedule_outlinks(site, seed_page, [
            'http://foo.org/', 'https://example.com/toot',
            'http://example.com/toot',
            'https://some.bad.domain/something'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)

    assert len(pages) == 4
    assert pages[0].url == 'http://example.com/'
    assert pages[0].hops_off == 0
    assert not 'hops_off_surt' in pages[0]
    assert set(pages[0].outlinks['accepted']) == {
        'https://example.com/toot', 'http://foo.org/',
        'http://example.com/toot'}
    assert pages[0].outlinks['blocked'] == []
    assert pages[0].outlinks['rejected'] == [
        'https://some.bad.domain/something']
    # in-scope link: hops_off stays 0
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://example.com/toot',
        'via_page_id': seed_page.id
    } in pages
    # out-of-scope links accepted with hops_off incremented to 1
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id
    } in pages
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'https://example.com/toot',
        'via_page_id': seed_page.id
    } in pages

    # next hop is past max_hops_off, but normal in scope url is in scope
    foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(
            site, foo_page, ['http://foo.org/bar', 'http://example.com/blah'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    # foo_page itself unchanged apart from the recorded outlinks
    assert foo_page == {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id,
        'outlinks': {
            'accepted': ['http://example.com/blah'],
            'blocked': [],
            'rejected': ['http://foo.org/bar'],
        }
    }
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 5
    # back in scope via foo.org: hops_off resets to 0
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 2,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 11,
        'site_id': site.id,
        'url': 'http://example.com/blah',
        'via_page_id': foo_page.id
    } in pages