def test_time_limit(httpd):
    """A site's `time_limit` setting should finish the site soon after the
    limit elapses, and the job should finish once all its sites have."""
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with one seed that could be crawled forever
    job_conf = {'seeds': [{
        'url': make_url(httpd, '/infinite/foo/'),
        'time_limit': 20}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1

    # time limit should be enforced pretty soon (poll up to 120s)
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_TIME_LIMIT'

    # all sites finished so job should be finished too
    start = time.time()
    job.refresh()
    while job.status != 'FINISHED' and time.time() - start < 10:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'
def test_stop_crawl(httpd):
    """Requesting a stop via the `brozzler-stop-crawl` command line entry
    point should quickly finish first a single site, then the whole job."""
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with three sites that could be crawled forever
    job_conf = {'seeds': [
        {'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port},
        {'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    sites = list(frontier.job_sites(job.id))
    assert not sites[0].stop_requested
    assert not sites[1].stop_requested

    # request crawl stop for one site using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--site=%s' % sites[0].id])
    sites[0].refresh()
    assert sites[0].stop_requested

    # stop request should be honored quickly (poll up to 120s)
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'

    # but the other sites and the job as a whole should still be crawling
    sites[1].refresh()
    assert sites[1].status == 'ACTIVE'
    sites[2].refresh()
    assert sites[2].status == 'ACTIVE'
    job.refresh()
    assert job.status == 'ACTIVE'

    # request crawl stop for the job using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl([
        'brozzler-stop-crawl', '--job=%s' % job.id])
    job.refresh()
    assert job.stop_requested

    # stop request should be honored quickly (poll up to 120s)
    start = time.time()
    while not job.status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'

    # all the sites should have finished with FINISHED_STOP_REQUESTED
    for site in sites:
        site.refresh()
        assert site.status == 'FINISHED_STOP_REQUESTED'
def test_time_limit(httpd):
    """A site's `time_limit` setting should finish the site soon after the
    limit elapses, and the job should finish once all its sites have."""
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with one seed that could be crawled forever
    job_conf = {'seeds': [{
        'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
        'time_limit': 20}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1

    # time limit should be enforced pretty soon (poll up to 120s)
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_TIME_LIMIT'

    # all sites finished so job should be finished too
    start = time.time()
    job.refresh()
    while job.status != 'FINISHED' and time.time() - start < 10:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'
def test_honor_stop_request():
    """`frontier.honor_stop_request` should raise `brozzler.CrawlStopped`
    when a stop has been requested on either the job or the site."""
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # 1. test stop request on job
    job_conf = {'seeds': [{'url': 'http://example.com'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]
    assert site.job_id == job.id

    # does not raise exception
    frontier.honor_stop_request(site)

    # set job.stop_requested (use doublethink.utcnow() consistently, same as
    # the site case below, instead of datetime.utcnow().replace(tzinfo=...))
    job.stop_requested = doublethink.utcnow()
    job.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)

    # 2. test stop request on site
    job_conf = {'seeds': [{'url': 'http://example.com'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    sites = list(frontier.job_sites(job.id))
    assert len(sites) == 1
    site = sites[0]
    assert site.job_id == job.id

    # does not raise exception
    frontier.honor_stop_request(site)

    # set site.stop_requested
    site.stop_requested = doublethink.utcnow()
    site.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)
def test_max_claimed_sites():
    # max_claimed_sites is a brozzler job setting that puts a cap on the number
    # of the job's sites that can be brozzled simultaneously across the cluster
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # clean slate
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()

    job_conf = {
        'seeds': [{'url': 'http://example.com/%s' % n} for n in range(1, 6)],
        'max_claimed_sites': 3,
    }
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    assert job.max_claimed_sites == 3
    assert len(list(frontier.job_sites(job.id))) == 5

    # one site claimed, then only two more before the cap kicks in
    assert len(frontier.claim_sites(1)) == 1
    assert len(frontier.claim_sites(3)) == 2
    with pytest.raises(brozzler.NothingToClaim):
        frontier.claim_sites(3)

    # clean slate for the next one
    rr.table('jobs').delete().run()
    rr.table('sites').delete().run()
def test_max_claimed_sites():
    # max_claimed_sites is a brozzler job setting that puts a cap on the number
    # of the job's sites that can be brozzled simultaneously across the cluster
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # start from empty tables
    for table in ('jobs', 'sites'):
        rr.table(table).delete().run()

    seed_urls = [
        'http://example.com/1',
        'http://example.com/2',
        'http://example.com/3',
        'http://example.com/4',
        'http://example.com/5',
    ]
    job_conf = {
        'seeds': [{'url': url} for url in seed_urls],
        'max_claimed_sites': 3,
    }
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    assert job.max_claimed_sites == 3
    assert len(list(frontier.job_sites(job.id))) == 5

    # claims across calls are capped at max_claimed_sites total
    first_batch = frontier.claim_sites(1)
    assert len(first_batch) == 1
    second_batch = frontier.claim_sites(3)
    assert len(second_batch) == 2
    with pytest.raises(brozzler.NothingToClaim):
        frontier.claim_sites(3)

    # leave empty tables for the next test
    for table in ('jobs', 'sites'):
        rr.table(table).delete().run()
def test_basics():
    """Creating a two-seed job should populate the expected job, site and
    page records in rethinkdb."""
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [
        {'url': 'http://example.com'},
        {'url': 'https://example.org/'}]}
    job = brozzler.new_job(frontier, job_conf)

    assert job.id
    assert job.starts_and_stops
    assert job.starts_and_stops[0]['start']
    expected_job = {
        'id': job.id,
        'conf': {
            'seeds': [
                {'url': 'http://example.com'},
                {'url': 'https://example.org/'},
            ],
        },
        'status': 'ACTIVE',
        'starts_and_stops': [
            {'start': job.starts_and_stops[0]['start'], 'stop': None},
        ],
    }
    assert job == expected_job

    # sites sorted by seed so [0] is example.com, [1] is example.org
    sites = sorted(frontier.job_sites(job.id), key=lambda site: site.seed)
    assert len(sites) == 2
    assert sites[0].starts_and_stops[0]['start']
    assert sites[1].starts_and_stops[0]['start']
    expected_site0 = {
        'claimed': False,
        'id': sites[0].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {'surt': 'http://(com,example,)/'},
        'seed': 'http://example.com',
        'starts_and_stops': [
            {'start': sites[0].starts_and_stops[0]['start'], 'stop': None},
        ],
        'status': 'ACTIVE',
    }
    assert sites[0] == expected_site0
    expected_site1 = {
        'claimed': False,
        'id': sites[1].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {'surt': 'https://(org,example,)/'},
        'seed': 'https://example.org/',
        'starts_and_stops': [
            {'start': sites[1].starts_and_stops[0]['start'], 'stop': None},
        ],
        'status': 'ACTIVE',
    }
    assert sites[1] == expected_site1

    pages = list(frontier.site_pages(sites[0].id))
    assert len(pages) == 1
    expected_page0 = {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off_surt': 0,
        'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[0].id,
        'url': 'http://example.com',
    }
    assert pages[0] == expected_page0
    pages = list(frontier.site_pages(sites[1].id))
    assert len(pages) == 1
    expected_page1 = {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off_surt': 0,
        'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[1].id,
        'url': 'https://example.org/',
    }
    assert pages[0] == expected_page1

    # test "brozzled" parameter of frontier.site_pages
    def n_pages(**kwargs):
        return len(list(frontier.site_pages(sites[1].id, **kwargs)))

    assert n_pages() == 1
    assert n_pages(brozzled=True) == 0
    assert n_pages(brozzled=False) == 1
    pages[0].brozzle_count = 1
    pages[0].save()
    assert n_pages() == 1
    assert n_pages(brozzled=True) == 1
    assert n_pages(brozzled=False) == 0
    pages[0].brozzle_count = 32819
    pages[0].save()
    assert n_pages() == 1
    assert n_pages(brozzled=True) == 1
    assert n_pages(brozzled=False) == 0
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    job.refresh()

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop']
    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[1]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop']
    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[2]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop']
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[2]['start']

    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 4
    assert job.starts_and_stops[3]['start']
    assert job.starts_and_stops[3]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 4
    assert site.starts_and_stops[3]['start']
    assert site.starts_and_stops[3]['stop'] is None

    # simulate a job stop request
    job_conf = {'seeds': [
        {'url': 'http://example.com/'}, {'url': 'http://example_2.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 2
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]
    # doublethink.utcnow() for consistency with the site stop request below
    job.stop_requested = doublethink.utcnow()
    job.save()

    # should raise a CrawlStopped
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site1)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    frontier.finished(site2, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert job.stop_requested
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert site2.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 1
    assert len(site2.starts_and_stops) == 1
    assert site1.starts_and_stops[0]['start']
    assert site1.starts_and_stops[0]['stop']
    # bugfix: compare each site's stop to its *own* start, not to the
    # unrelated `site` object left over from the first half of this test
    assert site1.starts_and_stops[0]['stop'] > site1.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['stop']
    assert site2.starts_and_stops[0]['stop'] > site2.starts_and_stops[0]['start']

    # simulate job resume after a stop request
    frontier.resume_job(job)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate a site stop request
    site1.stop_requested = doublethink.utcnow()
    site1.save()

    # should not raise a CrawlStopped
    frontier.honor_stop_request(site2)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop']
    # bugfix: was compared against the stale `site` variable
    assert site1.starts_and_stops[1]['stop'] > site1.starts_and_stops[1]['start']
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate site resume after a stop request
    frontier.resume_site(site1)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert site1.stop_requested is None
    assert len(site1.starts_and_stops) == 3
    assert site1.starts_and_stops[2]['start']
    assert site1.starts_and_stops[2]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    job.refresh()

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop']
    # fix: compare the stop to the start of the *same* crawl interval
    # (was starts_and_stops[0]['start'], a vacuously weaker assertion)
    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[1]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop']
    # fix: same-interval comparison (was starts_and_stops[0]['start'])
    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[2]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop']
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[2]['start']
def test_basics():
    """Creating a two-seed job should populate the expected job, site and
    page records in rethinkdb."""
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [
        {'url': 'http://example.com'},
        {'url': 'https://example.org/'}]}
    job = brozzler.new_job(frontier, job_conf)

    assert job.id
    assert job.starts_and_stops
    assert job.starts_and_stops[0]['start']
    assert job == {
        'id': job.id,
        'conf': {'seeds': [
            {'url': 'http://example.com'},
            {'url': 'https://example.org/'},
        ]},
        'status': 'ACTIVE',
        'starts_and_stops': [
            {'start': job.starts_and_stops[0]['start'], 'stop': None},
        ],
    }

    # sites sorted by seed so [0] is example.com, [1] is example.org
    sites = sorted(frontier.job_sites(job.id), key=lambda s: s.seed)
    assert len(sites) == 2
    expected_seeds = ['http://example.com', 'https://example.org/']
    expected_scopes = [
        {'accepts': [{'ssurt': 'com,example,//http:/'}]},
        {'accepts': [{'ssurt': 'org,example,//https:/'}]},
    ]
    for site, seed, scope in zip(sites, expected_seeds, expected_scopes):
        assert site.starts_and_stops[0]['start']
        assert site == {
            'claimed': False,
            'id': site.id,
            'job_id': job.id,
            'last_claimed': brozzler.EPOCH_UTC,
            'last_disclaimed': brozzler.EPOCH_UTC,
            'scope': scope,
            'seed': seed,
            'starts_and_stops': [
                {'start': site.starts_and_stops[0]['start'], 'stop': None},
            ],
            'status': 'ACTIVE',
        }

    # each site gets exactly one page, the seed page; after this loop `pages`
    # holds sites[1]'s page list, which the brozzled checks below mutate
    for site, seed in zip(sites, expected_seeds):
        pages = list(frontier.site_pages(site.id))
        assert len(pages) == 1
        assert pages[0] == {
            'brozzle_count': 0,
            'claimed': False,
            'hops_from_seed': 0,
            'hops_off': 0,
            'id': brozzler.Page.compute_id(site.id, seed),
            'job_id': job.id,
            'needs_robots_check': True,
            'priority': 1000,
            'site_id': site.id,
            'url': seed,
        }

    # test "brozzled" parameter of frontier.site_pages
    site1_id = sites[1].id
    assert len(list(frontier.site_pages(site1_id))) == 1
    assert len(list(frontier.site_pages(site1_id, brozzled=True))) == 0
    assert len(list(frontier.site_pages(site1_id, brozzled=False))) == 1
    pages[0].brozzle_count = 1
    pages[0].save()
    assert len(list(frontier.site_pages(site1_id))) == 1
    assert len(list(frontier.site_pages(site1_id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(site1_id, brozzled=False))) == 0
    pages[0].brozzle_count = 32819
    pages[0].save()
    assert len(list(frontier.site_pages(site1_id))) == 1
    assert len(list(frontier.site_pages(site1_id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(site1_id, brozzled=False))) == 0