def _brozzle_site(self, browser, site):
    """Brozzle pages of ``site`` with ``browser`` for one session, then clean up.

    Pages are claimed from the frontier one at a time and brozzled until a
    shutdown is requested, about seven minutes have elapsed (checked between
    pages), or an exception ends the session.

    Whatever happens, the browser is stopped, the site is disclaimed
    (together with the page that was in progress, if any), the browser is
    returned to the pool and the current thread is removed from
    ``self._browsing_threads``.

    Args:
        browser: browser used for this session; must provide
            ``chrome_port``, ``persist_and_read_cookie_db()`` and ``stop()``.
        site: the site being crawled; ``job_id`` is read and ``cookie_db``
            is updated after every page.
    """
    start = time.time()
    # The page currently claimed; reset to None once it has been completed so
    # that only an unfinished page is handed back by disclaim_site().
    page = None
    try:
        while (not self._shutdown_requested.is_set()
                and time.time() - start < 7 * 60):
            self._frontier.honor_stop_request(site.job_id)
            page = self._frontier.claim_page(
                site, "%s:%s" % (socket.gethostname(), browser.chrome_port))
            outlinks = self.brozzle_page(browser, site, page)
            site.cookie_db = browser.persist_and_read_cookie_db()
            self._frontier.completed_page(site, page)
            self._frontier.scope_and_schedule_outlinks(
                site, page, outlinks)
            page = None
    except brozzler.NothingToClaim:
        self.logger.info("no pages left for site %s", site)
    except brozzler.ReachedLimit as e:
        self._frontier.reached_limit(site, e)
    except brozzler.CrawlJobStopped:
        self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
    except brozzler.browser.BrowsingAborted:
        self.logger.info("%s shut down", browser)
    except Exception:
        # Session boundary: log anything unexpected instead of letting it
        # escape, but do not swallow SystemExit/KeyboardInterrupt.
        self.logger.critical("unexpected exception", exc_info=True)
    finally:
        self.logger.info(
            "finished session brozzling site, stopping "
            "browser and disclaiming site")
        # Each cleanup step is chained with try/finally so that a failure in
        # one step (e.g. browser.stop()) cannot leak the site claim, the
        # pooled browser or the thread registration.
        try:
            browser.stop()
        finally:
            try:
                self._frontier.disclaim_site(site, page)
            finally:
                try:
                    self._browser_pool.release(browser)
                finally:
                    self._browsing_threads.remove(
                        threading.current_thread())
def _brozzle_site_thread_target(self, browser, site):
    """Thread entry point: brozzle ``site`` with ``browser``, then clean up.

    Delegates the actual work to ``self.brozzle_site(browser, site)``.
    Afterwards, whether or not that call raised, the browser is stopped and
    released back to ``self._browser_pool``, and the current thread is removed
    from ``self._browsing_threads`` while holding
    ``self._browsing_threads_lock``.

    Any exception from ``brozzle_site`` or from a cleanup step propagates to
    the caller after the remaining cleanup steps have run.
    """
    try:
        self.brozzle_site(browser, site)
    finally:
        # Chain the cleanup steps so that a failing browser.stop() cannot
        # prevent the browser from being released or the thread from being
        # de-registered.
        try:
            browser.stop()
        finally:
            try:
                self._browser_pool.release(browser)
            finally:
                with self._browsing_threads_lock:
                    self._browsing_threads.remove(
                        threading.current_thread())
def _brozzle_site(self, browser, site):
    """Brozzle pages of ``site`` with ``browser`` for one session, then clean up.

    Pages are claimed from the frontier one at a time for about seven minutes
    (the time limit is checked between pages) or until an exception ends the
    session. A page that needs a robots check and is not permitted by
    robots.txt is not brozzled, but is still marked completed.

    Whatever happens, the browser is stopped, the site is disclaimed
    (together with the page that was in progress, if any), the browser is
    returned to the pool and the current thread is removed from
    ``self._browsing_threads`` under ``self._browsing_threads_lock``.

    Args:
        browser: browser used for this session; must provide ``chrome``
            (with ``port`` and ``persist_and_read_cookie_db()``),
            ``is_running()`` and ``stop()``.
        site: the site being crawled; ``job_id`` is read and ``cookie_db``
            may be updated after a page is brozzled.
    """
    # The page currently claimed; reset to None once it has been completed so
    # that only an unfinished page is handed back by disclaim_site().
    page = None
    try:
        start = time.time()
        while time.time() - start < 7 * 60:
            self._frontier.honor_stop_request(site.job_id)
            page = self._frontier.claim_page(
                site, "%s:%s" % (socket.gethostname(), browser.chrome.port))

            if (page.needs_robots_check
                    and not brozzler.is_permitted_by_robots(site, page.url)):
                # logging.warn() was a deprecated alias and no longer exists
                # in Python 3.13; logging.warning() is the supported spelling.
                logging.warning(
                    "page %s is blocked by robots.txt", page.url)
            else:
                outlinks = self.brozzle_page(browser, site, page)
                self._frontier.scope_and_schedule_outlinks(
                    site, page, outlinks)
                # Only read cookies back if the browser survived the page.
                if browser.is_running():
                    site.cookie_db = (
                        browser.chrome.persist_and_read_cookie_db())

            self._frontier.completed_page(site, page)
            page = None
    except brozzler.ShutdownRequested:
        self.logger.info("shutdown requested")
    except brozzler.NothingToClaim:
        self.logger.info("no pages left for site %s", site)
    except brozzler.ReachedLimit as e:
        self._frontier.reached_limit(site, e)
    except brozzler.CrawlJobStopped:
        self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
    except Exception:
        # Session boundary: log anything unexpected instead of letting it
        # escape, but do not swallow SystemExit/KeyboardInterrupt.
        self.logger.critical("unexpected exception", exc_info=True)
    finally:
        # Each cleanup step is chained with try/finally so that a failure in
        # one step (e.g. browser.stop()) cannot leak the site claim, the
        # pooled browser or the thread registration.
        try:
            browser.stop()
        finally:
            try:
                self._frontier.disclaim_site(site, page)
            finally:
                try:
                    self._browser_pool.release(browser)
                finally:
                    with self._browsing_threads_lock:
                        self._browsing_threads.remove(
                            threading.current_thread())
def _brozzle_site(self, browser, ydl, site):
    """Brozzle pages of ``site`` with ``browser`` for one session, then clean up.

    Starts the browser with the site's proxy, then claims pages from the
    frontier one at a time and brozzles them until a shutdown is requested,
    about sixty seconds have elapsed (checked between pages), or an exception
    ends the session.

    Whatever happens, the browser is stopped, the site is disclaimed
    (together with the page that was in progress, if any) and the browser is
    returned to the pool.

    Args:
        browser: browser used for this session; must provide
            ``start(proxy=...)`` and ``stop()``.
        ydl: passed through unchanged to ``self.brozzle_page``.
        site: the site being crawled; ``proxy`` is read.
    """
    start = time.time()
    # The page currently claimed; reset to None once it has been completed so
    # that only an unfinished page is handed back by disclaim_site().
    page = None
    try:
        browser.start(proxy=site.proxy)
        while (not self._shutdown_requested.is_set()
                and time.time() - start < 60):
            page = self._frontier.claim_page(site, self._id)
            outlinks = self.brozzle_page(browser, ydl, site, page)
            self._frontier.completed_page(site, page)
            self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
            page = None
    except brozzler.NothingToClaim:
        self.logger.info("no pages left for site %s", site)
    except brozzler.ReachedLimit as e:
        self._frontier.reached_limit(site, e)
    except brozzler.browser.BrowsingAborted:
        self.logger.info("%s shut down", browser)
    except Exception:
        # Session boundary: log anything unexpected instead of letting it
        # escape, but do not swallow SystemExit/KeyboardInterrupt.
        self.logger.critical("unexpected exception", exc_info=True)
    finally:
        self.logger.info(
            "finished session brozzling site, stopping browser "
            "and disclaiming site")
        # Chain the cleanup steps so that a failing browser.stop() cannot
        # leak the site claim or the pooled browser.
        try:
            browser.stop()
        finally:
            try:
                self._frontier.disclaim_site(site, page)
            finally:
                self._browser_pool.release(browser)