Example #1
0
 def _brozzle_site(self, browser, site):
     """Brozzle pages of ``site`` in one browser session, then clean up.

     Pages are claimed from ``self._frontier`` one at a time and brozzled
     until a shutdown is requested, the session has lasted seven minutes,
     or an exception ends the loop.  Afterwards the browser is stopped, the
     site is disclaimed (together with a page that was claimed but not
     completed, if any), the browser is released to ``self._browser_pool``
     and the current thread is removed from ``self._browsing_threads``.

     Exceptions raised while brozzling are handled or logged here and are
     not re-raised; exceptions raised by the cleanup steps propagate.

     Args:
         browser: browser used for the session; stopped and released here.
         site: site whose pages are claimed and brozzled.
     """
     start = time.time()
     # Non-None only while a claimed page has not been completed yet, so
     # the cleanup can hand that page back along with the site.
     page = None
     try:
         # A single session is capped at seven minutes.
         while (not self._shutdown_requested.is_set()
                and time.time() - start < 7 * 60):
             self._frontier.honor_stop_request(site.job_id)
             page = self._frontier.claim_page(
                 site,
                 "%s:%s" % (socket.gethostname(), browser.chrome_port))
             outlinks = self.brozzle_page(browser, site, page)
             site.cookie_db = browser.persist_and_read_cookie_db()
             self._frontier.completed_page(site, page)
             self._frontier.scope_and_schedule_outlinks(
                 site, page, outlinks)
             page = None
     except brozzler.NothingToClaim:
         self.logger.info("no pages left for site %s", site)
     except brozzler.ReachedLimit as e:
         self._frontier.reached_limit(site, e)
     except brozzler.CrawlJobStopped:
         self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
     except brozzler.browser.BrowsingAborted:
         self.logger.info("%s shut down", browser)
     except:
         # Deliberate catch-all: whatever went wrong, log it and fall
         # through to the cleanup below instead of propagating.
         self.logger.critical("unexpected exception", exc_info=True)
     finally:
         self.logger.info("finished session brozzling site, stopping "
                          "browser and disclaiming site")
         # Every cleanup step sits in its own ``finally`` so that a failure
         # in one of them (e.g. the browser failing to stop) cannot leave
         # the site claimed, the browser checked out of the pool, or this
         # thread still registered.
         try:
             browser.stop()
         finally:
             try:
                 self._frontier.disclaim_site(site, page)
             finally:
                 try:
                     self._browser_pool.release(browser)
                 finally:
                     self._browsing_threads.remove(
                         threading.current_thread())
Example #2
0
 def _brozzle_site_thread_target(self, browser, site):
     """Thread entry point: brozzle ``site`` with ``browser``, then clean up.

     Delegates to ``self.brozzle_site``.  Whether or not that call raises,
     the browser is stopped and released to ``self._browser_pool`` and the
     current thread is removed from ``self._browsing_threads`` while
     holding ``self._browsing_threads_lock``.

     Args:
         browser: browser handed to ``brozzle_site``; stopped and released
             here.
         site: site handed to ``brozzle_site``.
     """
     try:
         self.brozzle_site(browser, site)
     finally:
         # Each cleanup step has its own ``finally`` so that a failing
         # ``browser.stop()`` cannot keep the browser checked out of the
         # pool or leave this thread registered.
         try:
             browser.stop()
         finally:
             try:
                 self._browser_pool.release(browser)
             finally:
                 with self._browsing_threads_lock:
                     self._browsing_threads.remove(
                         threading.current_thread())
Example #3
0
 def _brozzle_site_thread_target(self, browser, site):
     """Run ``self.brozzle_site`` for ``site`` and always clean up after it.

     After ``brozzle_site`` returns or raises, the browser is stopped and
     handed back to ``self._browser_pool``, and the current thread is taken
     out of ``self._browsing_threads`` under
     ``self._browsing_threads_lock``.

     Args:
         browser: browser passed to ``brozzle_site``; stopped and released
             here.
         site: site passed to ``brozzle_site``.
     """
     try:
         self.brozzle_site(browser, site)
     finally:
         # Nested ``finally`` blocks guarantee that an exception from
         # ``browser.stop()`` does not skip the pool release or the thread
         # bookkeeping that follow it.
         try:
             browser.stop()
         finally:
             try:
                 self._browser_pool.release(browser)
             finally:
                 with self._browsing_threads_lock:
                     self._browsing_threads.remove(
                         threading.current_thread())
Example #4
0
    def _brozzle_site(self, browser, site):
        """Brozzle pages of ``site`` in one browser session, then clean up.

        Pages are claimed from ``self._frontier`` one at a time for up to
        seven minutes, or until an exception ends the loop.  A page that
        needs a robots check and is not permitted by robots.txt is not
        brozzled, but it is still marked completed.  For brozzled pages the
        outlinks are scoped and scheduled, and the site's cookie db is
        refreshed if the browser is still running.

        Afterwards the browser is stopped, the site is disclaimed (together
        with a page that was claimed but not completed, if any), the
        browser is released to ``self._browser_pool`` and the current
        thread is removed from ``self._browsing_threads`` under
        ``self._browsing_threads_lock``.

        Exceptions raised while brozzling are handled or logged here and
        are not re-raised; exceptions raised by the cleanup steps propagate.

        Args:
            browser: browser used for the session; stopped and released
                here.
            site: site whose pages are claimed and brozzled.
        """
        # Non-None only while a claimed page has not been completed yet, so
        # the cleanup can hand that page back along with the site.
        page = None
        try:
            start = time.time()
            # A single session is capped at seven minutes.
            while time.time() - start < 7 * 60:
                self._frontier.honor_stop_request(site.job_id)
                page = self._frontier.claim_page(site, "%s:%s" % (
                    socket.gethostname(), browser.chrome.port))

                if (page.needs_robots_check and
                        not brozzler.is_permitted_by_robots(site, page.url)):
                    # ``warn`` is a deprecated alias of ``warning``; log via
                    # the instance logger like the rest of this method.
                    self.logger.warning(
                        "page %s is blocked by robots.txt", page.url)
                else:
                    outlinks = self.brozzle_page(browser, site, page)
                    self._frontier.scope_and_schedule_outlinks(
                            site, page, outlinks)
                    if browser.is_running():
                        site.cookie_db = (
                            browser.chrome.persist_and_read_cookie_db())

                self._frontier.completed_page(site, page)
                page = None
        except brozzler.ShutdownRequested:
            self.logger.info("shutdown requested")
        except brozzler.NothingToClaim:
            self.logger.info("no pages left for site %s", site)
        except brozzler.ReachedLimit as e:
            self._frontier.reached_limit(site, e)
        except brozzler.CrawlJobStopped:
            self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
        except:
            # Deliberate catch-all: whatever went wrong, log it and fall
            # through to the cleanup below instead of propagating.
            self.logger.critical("unexpected exception", exc_info=True)
        finally:
            # Every cleanup step sits in its own ``finally`` so that a
            # failure in one of them (e.g. the browser failing to stop)
            # cannot leave the site claimed, the browser checked out of the
            # pool, or this thread still registered.
            try:
                browser.stop()
            finally:
                try:
                    self._frontier.disclaim_site(site, page)
                finally:
                    try:
                        self._browser_pool.release(browser)
                    finally:
                        with self._browsing_threads_lock:
                            self._browsing_threads.remove(
                                threading.current_thread())
Example #5
0
 def _brozzle_site(self, browser, ydl, site):
     """Brozzle pages of ``site`` in one browser session, then clean up.

     Starts the browser with the site's proxy, then claims pages from
     ``self._frontier`` one at a time and brozzles them until a shutdown is
     requested, the session has lasted sixty seconds, or an exception ends
     the loop.  Afterwards the browser is stopped, the site is disclaimed
     (together with a page that was claimed but not completed, if any) and
     the browser is released to ``self._browser_pool``.

     Exceptions raised while starting the browser or brozzling are handled
     or logged here and are not re-raised; exceptions raised by the cleanup
     steps propagate.

     Args:
         browser: browser used for the session; started, stopped and
             released here.
         ydl: passed through unchanged to ``self.brozzle_page``.
         site: site whose pages are claimed and brozzled.
     """
     start = time.time()
     # Non-None only while a claimed page has not been completed yet, so
     # the cleanup can hand that page back along with the site.
     page = None
     try:
         browser.start(proxy=site.proxy)
         # A single session is capped at sixty seconds.
         while (not self._shutdown_requested.is_set()
                and time.time() - start < 60):
             page = self._frontier.claim_page(site, self._id)
             outlinks = self.brozzle_page(browser, ydl, site, page)
             self._frontier.completed_page(site, page)
             self._frontier.scope_and_schedule_outlinks(
                 site, page, outlinks)
             page = None
     except brozzler.NothingToClaim:
         self.logger.info("no pages left for site %s", site)
     except brozzler.ReachedLimit as e:
         self._frontier.reached_limit(site, e)
     except brozzler.browser.BrowsingAborted:
         self.logger.info("%s shut down", browser)
     except:
         # Deliberate catch-all: whatever went wrong, log it and fall
         # through to the cleanup below instead of propagating.
         self.logger.critical("unexpected exception", exc_info=True)
     finally:
         self.logger.info("finished session brozzling site, stopping "
                          "browser and disclaiming site")
         # Every cleanup step sits in its own ``finally`` so that a failing
         # ``browser.stop()`` cannot leave the site claimed or the browser
         # checked out of the pool.
         try:
             browser.stop()
         finally:
             try:
                 self._frontier.disclaim_site(site, page)
             finally:
                 self._browser_pool.release(browser)