def test_robots(httpd):
    '''
    Basic test of robots.txt user-agent substring matching.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(None, {'seed': url, 'user_agent': 'im/a/GoOdbot/yep'})
    assert brozzler.is_permitted_by_robots(site, url)

    site = brozzler.Site(None, {'seed': url, 'user_agent': 'im/a bAdBOt/uh huh'})
    assert not brozzler.is_permitted_by_robots(site, url)
def test_robots(httpd):
    '''
    Basic test of robots.txt user-agent substring matching.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
    assert brozzler.is_permitted_by_robots(site, url)

    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
    assert not brozzler.is_permitted_by_robots(site, url)
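Both variants of the test rely on the `httpd` fixture serving a robots.txt whose groups are matched against the configured user agent by case-insensitive substring. The sketch below shows a robots.txt that would produce the expected results; the actual fixture content is an assumption, not quoted from the test suite.

# A minimal sketch of a robots.txt that would make the tests above pass,
# assuming case-insensitive substring matching of the user agent against
# each "User-agent:" group (the real file served by `httpd` may differ).
EXAMPLE_ROBOTS_TXT = '''\
User-agent: *
Disallow:

User-agent: badbot
Disallow: /
'''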
def test_robots_socket_timeout():
    stop_hanging = threading.Event()

    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            stop_hanging.wait(60)
            self.connection.sendall(
                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')

    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout

    httpd = http.server.HTTPServer(('localhost', 0), Handler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = 'http://localhost:%s/' % httpd.server_port
        site = brozzler.Site(None, {'seed': url})
        brozzler.robots._SessionRaiseOn420.timeout = 2
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
        stop_hanging.set()
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
    '''
    Returns tuple (
        dict of {page_id: Page} of fresh `brozzler.Page` representing in
            scope links accepted by robots policy,
        set of in scope urls (canonicalized) blocked by robots policy,
        set of out-of-scope urls (canonicalized)).
    '''
    pages = {}  # {page_id: Page, ...}
    blocked = set()
    out_of_scope = set()
    for url in outlinks or []:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        decision = site.accept_reject_or_neither(
                url_for_scoping, parent_page=parent_page)
        if decision is True:
            hops_off = 0
        elif decision is None:
            decision = parent_page.hops_off < site.scope.get(
                    'max_hops_off', 0)
            hops_off = parent_page.hops_off + 1
        if decision is True:
            if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                fresh_page = self._build_fresh_page(
                        site, parent_page, url, hops_off)
                if fresh_page.id in pages:
                    self._merge_page(pages[fresh_page.id], fresh_page)
                else:
                    pages[fresh_page.id] = fresh_page
            else:
                blocked.add(str(url_for_crawling))
        else:
            out_of_scope.add(str(url_for_crawling))
    return pages, blocked, out_of_scope
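The shape of the returned tuple is easiest to see from the caller's side. The following is a hypothetical caller, not brozzler's actual scheduler; `new_page` is borrowed from the `new_site` snippets elsewhere in this collection, and the logging line is illustrative.

# Hypothetical caller of _scope_and_enforce_robots(); the frontier methods
# used here are assumptions for illustration only.
pages, blocked, out_of_scope = frontier._scope_and_enforce_robots(
        site, parent_page, outlinks)
logging.info(
        '%s urls in scope and permitted, %s blocked by robots, '
        '%s out of scope', len(pages), len(blocked), len(out_of_scope))
for fresh_page in pages.values():
    frontier.new_page(fresh_page)  # queue each accepted outlink for crawling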
def test_robots_http_statuses():
    for status in (
            200, 204, 400, 401, 402, 403, 404, 405,
            500, 501, 502, 503, 504, 505):
        class Handler(http.server.BaseHTTPRequestHandler):
            def do_GET(self):
                response = (('HTTP/1.1 %s Meaningless message\r\n'
                             + 'Content-length: 0\r\n'
                             + '\r\n') % status).encode('utf-8')
                self.connection.sendall(response)
                # self.send_response(status)
                # self.end_headers()

        httpd = http.server.HTTPServer(('localhost', 0), Handler)
        httpd_thread = threading.Thread(
                name='httpd', target=httpd.serve_forever)
        httpd_thread.start()

        try:
            url = 'http://localhost:%s/' % httpd.server_port
            site = brozzler.Site(None, {'seed': url})
            assert brozzler.is_permitted_by_robots(site, url)
        finally:
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()
def test_proxy_down():
    '''
    Test that all fetching scenarios raise `brozzler.ProxyError` when the
    proxy is down.

    This test needs to cover every possible fetch through the proxy other
    than fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in (
            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
        worker = brozzler.BrozzlerWorker(
                frontier=None, proxy=not_listening_proxy)
        site = brozzler.Site(None, {
            'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(
                    site, 'http://example.com/', proxy=not_listening_proxy)

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = worker._youtube_dl(tempdir, site)
            with pytest.raises(brozzler.ProxyError):
                worker._try_youtube_dl(ydl, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
            worker._fetch_url(site, page)

        # WARCPROX_WRITE_RECORD
        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                    warcprox_address=not_listening_proxy,
                    url='test://proxy_down/warcprox_write_record',
                    warc_type='metadata', content_type='text/plain',
                    payload=b'''payload doesn't matter here''')
def brozzle_site(self, browser, site): try: start = time.time() page = None self._frontier.honor_stop_request(site) self.logger.info("brozzling site (proxy=%r) %r", self._proxy_for(site), site) while time.time() - start < 7 * 60: site.refresh() self._frontier.honor_stop_request(site) page = self._frontier.claim_page( site, "%s:%s" % (socket.gethostname(), browser.chrome.port)) if (page.needs_robots_check and not brozzler.is_permitted_by_robots( site, page.url, self._proxy_for(site))): logging.warn("page %s is blocked by robots.txt", page.url) page.blocked_by_robots = True self._frontier.completed_page(site, page) else: outlinks = self.brozzle_page(browser, site, page) self._frontier.completed_page(site, page) self._frontier.scope_and_schedule_outlinks( site, page, outlinks) if browser.is_running(): site.cookie_db = browser.chrome.persist_and_read_cookie_db( ) page = None except brozzler.ShutdownRequested: self.logger.info("shutdown requested") except brozzler.NothingToClaim: self.logger.info("no pages left for site %s", site) except brozzler.ReachedLimit as e: self._frontier.reached_limit(site, e) except brozzler.CrawlStopped: self._frontier.finished(site, "FINISHED_STOP_REQUESTED") # except brozzler.browser.BrowsingAborted: # self.logger.info("{} shut down".format(browser)) except brozzler.ProxyError as e: if self._warcprox_auto: logging.error( 'proxy error (site.proxy=%s), will try to choose a ' 'healthy instance next time site is brozzled: %s', site.proxy, e) site.proxy = None else: # using brozzler-worker --proxy, nothing to do but try the # same proxy again next time logging.error('proxy error (site.proxy=%r): %r', site.proxy, e) except: self.logger.critical("unexpected exception", exc_info=True) finally: if start: site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start self._frontier.disclaim_site(site, page)
def test_proxy_down():
    '''
    Test that all fetching scenarios raise `brozzler.ProxyError` when the
    proxy is down.

    This test needs to cover every possible fetch through the proxy other
    than fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    for not_listening_proxy in (
            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
        worker = brozzler.BrozzlerWorker(
                frontier=None, proxy=not_listening_proxy)
        site = brozzler.Site(None, {
            'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(
                    site, 'http://example.com/', proxy=not_listening_proxy)

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            with pytest.raises(brozzler.ProxyError):
                brozzler.ydl.do_youtube_dl(worker, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
            worker._fetch_url(site, page.url)

        # WARCPROX_WRITE_RECORD
        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                    warcprox_address=not_listening_proxy,
                    url='test://proxy_down/warcprox_write_record',
                    warc_type='metadata', content_type='text/plain',
                    payload=b'''payload doesn't matter here''')
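As an aside, the second failure mode depends on the fact that a TCP socket which is bound but never calls listen() refuses connections on most platforms. The standalone snippet below, which is not part of the test suite, demonstrates that behavior.

# Standalone illustration of "port bound but not accepting connections":
# bind() without listen() means any connection attempt is refused, so a
# fetch through such a "proxy" fails immediately.
import socket

sock = socket.socket()
sock.bind(('127.0.0.1', 0))        # bound, but listen() is never called
port = sock.getsockname()[1]
try:
    socket.create_connection(('127.0.0.1', port), timeout=2)
except ConnectionRefusedError:
    print('connection refused, as the test relies on')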
def new_site(frontier, site): logging.info("new site {}".format(site)) frontier.new_site(site) try: if brozzler.is_permitted_by_robots(site, site.seed): page = brozzler.Page(site.seed, site_id=site.id, job_id=site.job_id, hops_from_seed=0, priority=1000) frontier.new_page(page) logging.info("queued page %s", page) else: logging.warn("seed url {} is blocked by robots.txt".format(site.seed)) except brozzler.ReachedLimit as e: frontier.reached_limit(site, e)
def scope_and_schedule_outlinks(self, site, parent_page, outlinks): if site.remember_outlinks: parent_page.outlinks = {"accepted": [], "blocked": [], "rejected": []} counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0} for url in outlinks or []: u = brozzler.site.Url(url) if site.is_in_scope(u, parent_page=parent_page): if brozzler.is_permitted_by_robots(site, url): if not u.surt.startswith(site.scope["surt"]): hops_off_surt = parent_page.hops_off_surt + 1 else: hops_off_surt = 0 new_child_page = brozzler.Page( url, site_id=site.id, job_id=site.job_id, hops_from_seed=parent_page.hops_from_seed + 1, via_page_id=parent_page.id, hops_off_surt=hops_off_surt, ) existing_child_page = self.page(new_child_page.id) if existing_child_page: existing_child_page.priority += new_child_page.priority self.update_page(existing_child_page) counts["updated"] += 1 else: self.new_page(new_child_page) counts["added"] += 1 if site.remember_outlinks: parent_page.outlinks["accepted"].append(url) else: counts["blocked"] += 1 if site.remember_outlinks: parent_page.outlinks["blocked"].append(url) else: counts["rejected"] += 1 if site.remember_outlinks: parent_page.outlinks["rejected"].append(url) if site.remember_outlinks: self.update_page(parent_page) self.logger.info( "%s new links added, %s existing links updated, %s links " "rejected, %s links blocked by robots from %s", counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page, )
def scope_and_schedule_outlinks(self, site, parent_page, outlinks): if site.remember_outlinks: parent_page.outlinks = { "accepted": [], "blocked": [], "rejected": [] } counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0} for url in outlinks or []: u = brozzler.site.Url(url) if site.is_in_scope(u, parent_page=parent_page): if brozzler.is_permitted_by_robots(site, url): if not u.surt.startswith(site.scope["surt"]): hops_off_surt = parent_page.hops_off_surt + 1 else: hops_off_surt = 0 new_child_page = brozzler.Page( url, site_id=site.id, job_id=site.job_id, hops_from_seed=parent_page.hops_from_seed + 1, via_page_id=parent_page.id, hops_off_surt=hops_off_surt) existing_child_page = self.page(new_child_page.id) if existing_child_page: existing_child_page.priority += new_child_page.priority self.update_page(existing_child_page) counts["updated"] += 1 else: self.new_page(new_child_page) counts["added"] += 1 if site.remember_outlinks: parent_page.outlinks["accepted"].append(url) else: counts["blocked"] += 1 if site.remember_outlinks: parent_page.outlinks["blocked"].append(url) else: counts["rejected"] += 1 if site.remember_outlinks: parent_page.outlinks["rejected"].append(url) if site.remember_outlinks: self.update_page(parent_page) self.logger.info( "%s new links added, %s existing links updated, %s links " "rejected, %s links blocked by robots from %s", counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
def test_robots_empty_response():
    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            self.connection.shutdown(socket.SHUT_RDWR)
            self.connection.close()

    httpd = http.server.HTTPServer(('localhost', 0), Handler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = 'http://localhost:%s/' % httpd.server_port
        site = brozzler.Site(None, {'seed': url})
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
def new_site(frontier, site): site.id = str(uuid.uuid4()) logging.info("new site {}".format(site)) try: # insert the Page into the database before the Site, to avoid situation # where a brozzler worker immediately claims the site, finds no pages # to crawl, and decides the site is finished try: if brozzler.is_permitted_by_robots(site, site.seed): page = brozzler.Page(site.seed, site_id=site.id, job_id=site.job_id, hops_from_seed=0, priority=1000) frontier.new_page(page) logging.info("queued page %s", page) else: logging.warn("seed url {} is blocked by robots.txt".format(site.seed)) finally: # finally block because we want to insert the Site no matter what frontier.new_site(site) except brozzler.ReachedLimit as e: frontier.reached_limit(site, e)
def scope_and_schedule_outlinks(self, site, parent_page, outlinks): counts = {"added":0,"updated":0,"rejected":0,"blocked":0} if outlinks: for url in outlinks: if site.is_in_scope(url, parent_page): if brozzler.is_permitted_by_robots(site, url): new_child_page = brozzler.Page(url, site_id=site.id, job_id=site.job_id, hops_from_seed=parent_page.hops_from_seed+1, via_page_id=parent_page.id) existing_child_page = self.page(new_child_page.id) if existing_child_page: existing_child_page.priority += new_child_page.priority self.update_page(existing_child_page) counts["updated"] += 1 else: self.new_page(new_child_page) counts["added"] += 1 else: counts["blocked"] += 1 else: counts["rejected"] += 1 self.logger.info("%s new links added, %s existing links updated, %s links rejected, %s links blocked by robots from %s", counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
def _brozzle_site(self, browser, site): page = None try: start = time.time() while time.time() - start < 7 * 60: self._frontier.honor_stop_request(site.job_id) page = self._frontier.claim_page(site, "%s:%s" % ( socket.gethostname(), browser.chrome.port)) if (page.needs_robots_check and not brozzler.is_permitted_by_robots(site, page.url)): logging.warn("page %s is blocked by robots.txt", page.url) else: outlinks = self.brozzle_page(browser, site, page) self._frontier.scope_and_schedule_outlinks( site, page, outlinks) if browser.is_running(): site.cookie_db = browser.chrome.persist_and_read_cookie_db() self._frontier.completed_page(site, page) page = None except brozzler.ShutdownRequested: self.logger.info("shutdown requested") except brozzler.NothingToClaim: self.logger.info("no pages left for site %s", site) except brozzler.ReachedLimit as e: self._frontier.reached_limit(site, e) except brozzler.CrawlJobStopped: self._frontier.finished(site, "FINISHED_STOP_REQUESTED") # except brozzler.browser.BrowsingAborted: # self.logger.info("{} shut down".format(browser)) except: self.logger.critical("unexpected exception", exc_info=True) finally: browser.stop() self._frontier.disclaim_site(site, page) self._browser_pool.release(browser) with self._browsing_threads_lock: self._browsing_threads.remove(threading.current_thread())
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
    '''
    Returns tuple (
        set of in scope urls (uncanonicalized) accepted by robots policy,
        set of in scope urls (canonicalized) blocked by robots policy,
        set of out-of-scope urls (canonicalized)).
    '''
    in_scope = set()
    blocked = set()
    out_of_scope = set()
    for url in outlinks or []:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        urlcanon.canon.remove_fragment(url_for_crawling)
        if site.is_in_scope(url_for_scoping, parent_page=parent_page):
            if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                in_scope.add(url)
            else:
                blocked.add(str(url_for_crawling))
        else:
            out_of_scope.add(str(url_for_crawling))
    return in_scope, blocked, out_of_scope
def new_site(frontier, site): site.id = str(uuid.uuid4()) logging.info("new site {}".format(site)) try: # insert the Page into the database before the Site, to avoid situation # where a brozzler worker immediately claims the site, finds no pages # to crawl, and decides the site is finished try: if brozzler.is_permitted_by_robots(site, site.seed): page = brozzler.Page(site.seed, site_id=site.id, job_id=site.job_id, hops_from_seed=0, priority=1000) frontier.new_page(page) logging.info("queued page %s", page) else: logging.warn("seed url %s is blocked by robots.txt", site.seed) finally: # finally block because we want to insert the Site no matter what frontier.new_site(site) except brozzler.ReachedLimit as e: frontier.reached_limit(site, e)
def brozzle_site(self, browser, site): try: site.last_claimed_by = '%s:%s' % ( socket.gethostname(), browser.chrome.port) site.save() start = time.time() page = None self._frontier.enforce_time_limit(site) self._frontier.honor_stop_request(site) # _proxy_for() call in log statement can raise brozzler.ProxyError # which is why we honor time limit and stop request first☝🏻 self.logger.info( "brozzling site (proxy=%r) %s", self._proxy_for(site), site) while time.time() - start < self.SITE_SESSION_MINUTES * 60: site.refresh() self._frontier.enforce_time_limit(site) self._frontier.honor_stop_request(site) page = self._frontier.claim_page(site, "%s:%s" % ( socket.gethostname(), browser.chrome.port)) if (page.needs_robots_check and not brozzler.is_permitted_by_robots( site, page.url, self._proxy_for(site))): logging.warning("page %s is blocked by robots.txt", page.url) page.blocked_by_robots = True self._frontier.completed_page(site, page) else: outlinks = self.brozzle_page( browser, site, page, enable_youtube_dl=not self._skip_youtube_dl) self._frontier.completed_page(site, page) self._frontier.scope_and_schedule_outlinks( site, page, outlinks) if browser.is_running(): site.cookie_db = browser.chrome.persist_and_read_cookie_db() page = None except brozzler.ShutdownRequested: self.logger.info("shutdown requested") except brozzler.NothingToClaim: self.logger.info("no pages left for site %s", site) except brozzler.ReachedLimit as e: self._frontier.reached_limit(site, e) except brozzler.ReachedTimeLimit as e: self._frontier.finished(site, "FINISHED_TIME_LIMIT") except brozzler.CrawlStopped: self._frontier.finished(site, "FINISHED_STOP_REQUESTED") # except brozzler.browser.BrowsingAborted: # self.logger.info("{} shut down".format(browser)) except brozzler.ProxyError as e: if self._warcprox_auto: logging.error( 'proxy error (site.proxy=%s), will try to choose a ' 'healthy instance next time site is brozzled: %s', site.proxy, e) site.proxy = None else: # using brozzler-worker --proxy, nothing to do but try the # same proxy again next time logging.error( 'proxy error (self._proxy=%r)', self._proxy, exc_info=1) except: self.logger.error( 'unexpected exception site=%r page=%r', site, page, exc_info=True) if page: page.failed_attempts = (page.failed_attempts or 0) + 1 if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES: self.logger.info( 'marking page "completed" after %s unexpected ' 'exceptions attempting to brozzle %s', page.failed_attempts, page) self._frontier.completed_page(site, page) page = None finally: if start: site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start self._frontier.disclaim_site(site, page)
def scope_and_schedule_outlinks(self, site, parent_page, outlinks): decisions = {"accepted": set(), "blocked": set(), "rejected": set()} counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0} for url in outlinks or []: url_for_scoping = urlcanon.semantic(url) url_for_crawling = urlcanon.whatwg(url) hashtag = (url_for_crawling.hash_sign + url_for_crawling.fragment).decode('utf-8') urlcanon.canon.remove_fragment(url_for_crawling) if site.is_in_scope(url_for_scoping, parent_page=parent_page): if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): if not url_for_scoping.surt().startswith( site.scope["surt"].encode("utf-8")): hops_off_surt = parent_page.hops_off_surt + 1 else: hops_off_surt = 0 new_child_page = brozzler.Page( self.rr, { 'url': str(url_for_crawling), 'site_id': site.id, 'job_id': site.job_id, 'hops_from_seed': parent_page.hops_from_seed + 1, 'via_page_id': parent_page.id, 'hops_off_surt': hops_off_surt }) existing_child_page = brozzler.Page.load( self.rr, new_child_page.id) if existing_child_page: existing_child_page.priority += new_child_page.priority if hashtag and existing_child_page.hashtags: hashtags = set(existing_child_page.hashtags) hashtags.add(hashtag) existing_child_page.hashtags = list(hashtags) elif hashtag: existing_child_page.hashtags = [hashtag] existing_child_page.save() counts["updated"] += 1 else: if hashtag: new_child_page.hashtags = [ hashtag, ] new_child_page.save() counts["added"] += 1 decisions["accepted"].add(str(url_for_crawling)) else: counts["blocked"] += 1 decisions["blocked"].add(str(url_for_crawling)) else: counts["rejected"] += 1 decisions["rejected"].add(str(url_for_crawling)) parent_page.outlinks = {} for k in decisions: parent_page.outlinks[k] = list(decisions[k]) parent_page.save() self.logger.info( "%s new links added, %s existing links updated, %s links " "rejected, %s links blocked by robots from %s", counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
def test_robots_dns_failure():
    # .invalid. is guaranteed nonexistent per rfc 6761
    url = 'http://whatever.invalid./'
    site = brozzler.Site(None, {'seed': url})
    assert brozzler.is_permitted_by_robots(site, url)
def test_robots_connection_failure():
    url = 'http://localhost:4/'  # nobody listens on port 4
    site = brozzler.Site(None, {'seed': url})
    assert brozzler.is_permitted_by_robots(site, url)
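Taken together, the robots tests above (socket timeout, unusual HTTP statuses, empty response, DNS failure, connection refused) describe a fail-open policy: when robots.txt cannot be fetched, the url is treated as permitted, while proxy trouble is surfaced as `brozzler.ProxyError`. The sketch below illustrates that policy under assumed names (`fetch_rules`, `rules.allowed`); it is not brozzler's actual robots.py.

# Minimal sketch of the fail-open policy the tests above exercise; the names
# fetch_rules and rules.allowed are assumptions, not brozzler's real API.
def is_permitted_fail_open(fetch_rules, url, user_agent):
    try:
        rules = fetch_rules(url)       # may raise on timeout, dns failure,
                                       # connection refused, bad status, etc.
    except brozzler.ProxyError:
        raise                          # proxy trouble is surfaced to the caller
    except Exception:
        return True                    # robots.txt unavailable: assume permitted
    return rules.allowed(url, user_agent)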
def brozzle_site(self, browser, site): try: site.last_claimed_by = '%s:%s' % ( socket.gethostname(), browser.chrome.port) site.save() start = time.time() page = None self._frontier.enforce_time_limit(site) self._frontier.honor_stop_request(site) # _proxy_for() call in log statement can raise brozzler.ProxyError # which is why we honor time limit and stop request first☝🏻 self.logger.info( "brozzling site (proxy=%r) %s", self._proxy_for(site), site) while time.time() - start < self.SITE_SESSION_MINUTES * 60: site.refresh() self._frontier.enforce_time_limit(site) self._frontier.honor_stop_request(site) page = self._frontier.claim_page(site, "%s:%s" % ( socket.gethostname(), browser.chrome.port)) if (page.needs_robots_check and not brozzler.is_permitted_by_robots( site, page.url, self._proxy_for(site))): logging.warn("page %s is blocked by robots.txt", page.url) page.blocked_by_robots = True self._frontier.completed_page(site, page) else: outlinks = self.brozzle_page( browser, site, page, enable_youtube_dl=not self._skip_youtube_dl) self._frontier.completed_page(site, page) self._frontier.scope_and_schedule_outlinks( site, page, outlinks) if browser.is_running(): site.cookie_db = browser.chrome.persist_and_read_cookie_db() page = None except brozzler.ShutdownRequested: self.logger.info("shutdown requested") except brozzler.NothingToClaim: self.logger.info("no pages left for site %s", site) except brozzler.ReachedLimit as e: self._frontier.reached_limit(site, e) except brozzler.ReachedTimeLimit as e: self._frontier.finished(site, "FINISHED_TIME_LIMIT") except brozzler.CrawlStopped: self._frontier.finished(site, "FINISHED_STOP_REQUESTED") # except brozzler.browser.BrowsingAborted: # self.logger.info("{} shut down".format(browser)) except brozzler.ProxyError as e: if self._warcprox_auto: logging.error( 'proxy error (site.proxy=%s), will try to choose a ' 'healthy instance next time site is brozzled: %s', site.proxy, e) site.proxy = None else: # using brozzler-worker --proxy, nothing to do but try the # same proxy again next time logging.error( 'proxy error (self._proxy=%r)', self._proxy, exc_info=1) except: self.logger.critical("unexpected exception", exc_info=True) finally: if start: site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start self._frontier.disclaim_site(site, page)