def _brozzle_site(self, browser, site):
    """Brozzle pages claimed from `site` in `browser` until the 7-minute
    session cap expires, shutdown is requested, or an expected
    control-flow exception (NothingToClaim, ReachedLimit,
    CrawlJobStopped, BrowsingAborted) ends the session.

    Always stops the browser, disclaims the site, releases the browser
    back to the pool, and removes this thread from the bookkeeping set.
    """
    start = time.time()
    page = None
    try:
        # one browsing session per site is capped at 7 minutes
        while (not self._shutdown_requested.is_set()
                and time.time() - start < 7 * 60):
            self._frontier.honor_stop_request(site.job_id)
            page = self._frontier.claim_page(
                    site, "%s:%s" % (
                        socket.gethostname(), browser.chrome_port))
            outlinks = self.brozzle_page(browser, site, page)
            if browser.is_running():
                site.cookie_db = browser.persist_and_read_cookie_db()
            self._frontier.completed_page(site, page)
            self._frontier.scope_and_schedule_outlinks(
                    site, page, outlinks)
            # page fully handled; don't re-disclaim it in `finally`
            page = None
    except brozzler.NothingToClaim:
        self.logger.info("no pages left for site %s", site)
    except brozzler.ReachedLimit as e:
        self._frontier.reached_limit(site, e)
    except brozzler.CrawlJobStopped:
        self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
    except brozzler.browser.BrowsingAborted:
        self.logger.info("{} shut down".format(browser))
    except Exception:
        # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # propagate instead of being logged and swallowed here
        self.logger.critical("unexpected exception", exc_info=True)
    finally:
        self.logger.info(
                "finished session brozzling site, stopping "
                "browser and disclaiming site")
        browser.stop()
        self._frontier.disclaim_site(site, page)
        self._browser_pool.release(browser)
        # NOTE(review): unlike a later revision in this file, no lock is
        # held around this remove — presumably single mutator; confirm
        self._browsing_threads.remove(threading.current_thread())
def brozzle_page(self, browser, site, page, on_screenshot=None):
    """Brozzle a single page: first attempt capture with youtube-dl,
    then browse the page in `browser` if it still needs browsing,
    otherwise fall back to a plain fetch.

    Returns the outlinks discovered by the browser, or [] when the page
    was not browsed.
    """
    def _on_screenshot(screenshot_png):
        # delegate to the caller's callback when supplied; otherwise
        # submit full screenshot + thumbnail to warcprox as resources
        if on_screenshot:
            on_screenshot(screenshot_png)
        elif self._proxy(site) and self._enable_warcprox_features(site):
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request "
                    "to warcprox with screenshot for %s", page)
            screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                    screenshot_png)
            self._warcprox_write_record(
                    warcprox_address=self._proxy(site),
                    url="screenshot:%s" % brozzler.fixup(page.url),
                    warc_type="resource", content_type="image/jpeg",
                    payload=screenshot_jpeg,
                    extra_headers=site.extra_headers())
            self._warcprox_write_record(
                    warcprox_address=self._proxy(site),
                    url="thumbnail:%s" % brozzler.fixup(page.url),
                    warc_type="resource", content_type="image/jpeg",
                    payload=thumbnail_jpeg,
                    extra_headers=site.extra_headers())

    self.logger.info("brozzling {}".format(page))
    # initialized here so the uses below don't raise NameError when
    # self._youtube_dl() fails before ydl.brozzler_spy can be read
    ydl_spy = None
    try:
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = self._youtube_dl(tempdir, site)
            ydl_spy = ydl.brozzler_spy  # remember for later
            self._try_youtube_dl(ydl, site, page)
    except brozzler.ReachedLimit as e:
        raise
    except brozzler.ShutdownRequested:
        raise
    except Exception as e:
        # youtube-dl wraps http errors; 430 presumably a warcprox
        # "reached limit" status — TODO confirm
        if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                and hasattr(e.exc_info[1], 'code')
                and e.exc_info[1].code == 430):
            self.logger.info(
                    'youtube-dl got %s %s processing %s',
                    e.exc_info[1].code, e.exc_info[1].msg, page.url)
        else:
            self.logger.error(
                    "youtube_dl raised exception on %s", page,
                    exc_info=True)

    if self._needs_browsing(page, ydl_spy):
        self.logger.info('needs browsing: %s', page)
        if not browser.is_running():
            browser.start(
                    proxy=self._proxy(site), cookie_db=site.cookie_db)
        final_page_url, outlinks = browser.browse_page(
                page.url, extra_headers=site.extra_headers(),
                behavior_parameters=site.behavior_parameters,
                user_agent=site.user_agent,
                on_screenshot=_on_screenshot)
        if final_page_url != page.url:
            page.note_redirect(final_page_url)
        return outlinks
    else:
        if not self._already_fetched(page, ydl_spy):
            self.logger.info('needs fetch: %s', page)
            self._fetch_url(site, page)
        else:
            self.logger.info('already fetched: %s', page)
        return []
def brozzle_site(self, browser, site):
    """Brozzle pages from `site` in `browser` for up to 7 minutes,
    honoring stop requests and robots.txt, then disclaim the site and
    record the active brozzling time.
    """
    try:
        start = time.time()
        page = None
        self._frontier.honor_stop_request(site)
        self.logger.info(
                "brozzling site (proxy=%r) %r",
                self._proxy_for(site), site)
        while time.time() - start < 7 * 60:
            site.refresh()
            self._frontier.honor_stop_request(site)
            page = self._frontier.claim_page(
                    site, "%s:%s" % (
                        socket.gethostname(), browser.chrome.port))
            if (page.needs_robots_check and
                    not brozzler.is_permitted_by_robots(
                        site, page.url, self._proxy_for(site))):
                # logging.warn is deprecated; use logging.warning
                logging.warning(
                        "page %s is blocked by robots.txt", page.url)
                page.blocked_by_robots = True
                self._frontier.completed_page(site, page)
            else:
                outlinks = self.brozzle_page(browser, site, page)
                self._frontier.completed_page(site, page)
                self._frontier.scope_and_schedule_outlinks(
                        site, page, outlinks)
                if browser.is_running():
                    site.cookie_db = \
                            browser.chrome.persist_and_read_cookie_db()
            # page fully handled; don't re-disclaim it in `finally`
            page = None
    except brozzler.ShutdownRequested:
        self.logger.info("shutdown requested")
    except brozzler.NothingToClaim:
        self.logger.info("no pages left for site %s", site)
    except brozzler.ReachedLimit as e:
        self._frontier.reached_limit(site, e)
    except brozzler.CrawlStopped:
        self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
    # except brozzler.browser.BrowsingAborted:
    #     self.logger.info("{} shut down".format(browser))
    except brozzler.ProxyError as e:
        if self._warcprox_auto:
            logging.error(
                    'proxy error (site.proxy=%s), will try to choose a '
                    'healthy instance next time site is brozzled: %s',
                    site.proxy, e)
            site.proxy = None
        else:
            # using brozzler-worker --proxy, nothing to do but try the
            # same proxy again next time
            logging.error(
                    'proxy error (site.proxy=%r): %r', site.proxy, e)
    except Exception:
        # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # propagate instead of being logged and swallowed here
        self.logger.critical("unexpected exception", exc_info=True)
    finally:
        if start:
            site.active_brozzling_time = (
                    site.active_brozzling_time or 0) + time.time() - start
        self._frontier.disclaim_site(site, page)
def _brozzle_site(self, browser, site):
    """Brozzle pages claimed from `site` in `browser` for up to 7
    minutes, skipping robots.txt-blocked pages, then stop the browser,
    disclaim the site, and release pooled resources.
    """
    page = None
    try:
        start = time.time()
        while time.time() - start < 7 * 60:
            self._frontier.honor_stop_request(site.job_id)
            page = self._frontier.claim_page(site, "%s:%s" % (
                socket.gethostname(), browser.chrome.port))
            if (page.needs_robots_check and
                    not brozzler.is_permitted_by_robots(site, page.url)):
                # logging.warn is deprecated; use logging.warning
                logging.warning(
                        "page %s is blocked by robots.txt", page.url)
            else:
                outlinks = self.brozzle_page(browser, site, page)
                self._frontier.scope_and_schedule_outlinks(
                        site, page, outlinks)
                if browser.is_running():
                    site.cookie_db = \
                            browser.chrome.persist_and_read_cookie_db()
            # robots-blocked pages are also marked completed here
            self._frontier.completed_page(site, page)
            page = None
    except brozzler.ShutdownRequested:
        self.logger.info("shutdown requested")
    except brozzler.NothingToClaim:
        self.logger.info("no pages left for site %s", site)
    except brozzler.ReachedLimit as e:
        self._frontier.reached_limit(site, e)
    except brozzler.CrawlJobStopped:
        self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
    # except brozzler.browser.BrowsingAborted:
    #     self.logger.info("{} shut down".format(browser))
    except Exception:
        # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # propagate instead of being logged and swallowed here
        self.logger.critical("unexpected exception", exc_info=True)
    finally:
        browser.stop()
        self._frontier.disclaim_site(site, page)
        self._browser_pool.release(browser)
        with self._browsing_threads_lock:
            self._browsing_threads.remove(threading.current_thread())
def brozzle_site(self, browser, site):
    """Brozzle pages from `site` in `browser` for up to
    SITE_SESSION_MINUTES, honoring time limits, stop requests and
    robots.txt; record failed attempts per page and give up on a page
    after brozzler.MAX_PAGE_FAILURES unexpected exceptions.
    """
    # initialized before the try so the `finally` block can't hit a
    # NameError if site.save() (or anything before the assignments
    # inside the try) raises
    start = None
    page = None
    try:
        site.last_claimed_by = '%s:%s' % (
                socket.gethostname(), browser.chrome.port)
        site.save()
        start = time.time()
        self._frontier.enforce_time_limit(site)
        self._frontier.honor_stop_request(site)
        # _proxy_for() call in log statement can raise brozzler.ProxyError
        # which is why we honor time limit and stop request first☝🏻
        self.logger.info(
                "brozzling site (proxy=%r) %s",
                self._proxy_for(site), site)
        while time.time() - start < self.SITE_SESSION_MINUTES * 60:
            site.refresh()
            self._frontier.enforce_time_limit(site)
            self._frontier.honor_stop_request(site)
            page = self._frontier.claim_page(site, "%s:%s" % (
                socket.gethostname(), browser.chrome.port))
            if (page.needs_robots_check and
                    not brozzler.is_permitted_by_robots(
                        site, page.url, self._proxy_for(site))):
                logging.warning(
                        "page %s is blocked by robots.txt", page.url)
                page.blocked_by_robots = True
                self._frontier.completed_page(site, page)
            else:
                outlinks = self.brozzle_page(
                        browser, site, page,
                        enable_youtube_dl=not self._skip_youtube_dl)
                self._frontier.completed_page(site, page)
                self._frontier.scope_and_schedule_outlinks(
                        site, page, outlinks)
                if browser.is_running():
                    site.cookie_db = \
                            browser.chrome.persist_and_read_cookie_db()
            # page fully handled; don't re-disclaim it in `finally`
            page = None
    except brozzler.ShutdownRequested:
        self.logger.info("shutdown requested")
    except brozzler.NothingToClaim:
        self.logger.info("no pages left for site %s", site)
    except brozzler.ReachedLimit as e:
        self._frontier.reached_limit(site, e)
    except brozzler.ReachedTimeLimit as e:
        self._frontier.finished(site, "FINISHED_TIME_LIMIT")
    except brozzler.CrawlStopped:
        self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
    # except brozzler.browser.BrowsingAborted:
    #     self.logger.info("{} shut down".format(browser))
    except brozzler.ProxyError as e:
        if self._warcprox_auto:
            logging.error(
                    'proxy error (site.proxy=%s), will try to choose a '
                    'healthy instance next time site is brozzled: %s',
                    site.proxy, e)
            site.proxy = None
        else:
            # using brozzler-worker --proxy, nothing to do but try the
            # same proxy again next time
            logging.error(
                    'proxy error (self._proxy=%r)', self._proxy,
                    exc_info=True)
    except Exception:
        # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # propagate instead of being logged and swallowed here
        self.logger.error(
                'unexpected exception site=%r page=%r', site, page,
                exc_info=True)
        if page:
            page.failed_attempts = (page.failed_attempts or 0) + 1
            if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
                self.logger.info(
                        'marking page "completed" after %s unexpected '
                        'exceptions attempting to brozzle %s',
                        page.failed_attempts, page)
                self._frontier.completed_page(site, page)
                page = None
    finally:
        if start:
            site.active_brozzling_time = (
                    site.active_brozzling_time or 0) + time.time() - start
        self._frontier.disclaim_site(site, page)
def _browse_page(self, browser, site, page, on_screenshot=None,
                 on_request=None):
    """Browse `page` in `browser`, wiring up callbacks that submit
    screenshots/thumbnails to warcprox, record embedded video responses
    on the page, and fetch service worker scripts.

    Returns the outlinks reported by the browser; notes a redirect on
    the page when the final url differs from page.url.
    """
    def _on_screenshot(screenshot_jpeg):
        if on_screenshot:
            on_screenshot(screenshot_jpeg)
        if self._using_warcprox(site):
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to %s with "
                    "screenshot for %s", self._proxy_for(site), page)
            thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="resource", content_type="image/jpeg",
                    payload=screenshot_jpeg,
                    extra_headers=site.extra_headers())
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="resource", content_type="image/jpeg",
                    payload=thumbnail_jpeg,
                    extra_headers=site.extra_headers())

    def _on_response(chrome_msg):
        # record successful (200/206) video/* responses on the page
        if ('params' in chrome_msg
                and 'response' in chrome_msg['params']
                and 'mimeType' in chrome_msg['params']['response']
                and chrome_msg['params']['response'].get(
                    'mimeType', '').startswith('video/')
                # skip manifests of DASH segmented video -
                # see https://github.com/internetarchive/brozzler/pull/70
                and chrome_msg['params']['response']['mimeType']
                    != 'video/vnd.mpeg.dash.mpd'
                and chrome_msg['params']['response'].get('status')
                    in (200, 206)):
            video = {
                'blame': 'browser',
                'url': chrome_msg['params']['response'].get('url'),
                'response_code':
                    chrome_msg['params']['response']['status'],
                'content-type':
                    chrome_msg['params']['response']['mimeType'],
            }
            response_headers = CaseInsensitiveDict(
                    chrome_msg['params']['response']['headers'])
            if 'content-length' in response_headers:
                video['content-length'] = int(
                        response_headers['content-length'])
            if 'content-range' in response_headers:
                video['content-range'] = \
                        response_headers['content-range']
            logging.debug('embedded video %s', video)
            # idiom fix: was `if not 'videos' in page:`
            if 'videos' not in page:
                page.videos = []
            page.videos.append(video)

    sw_fetched = set()
    def _on_service_worker_version_updated(chrome_msg):
        # https://github.com/internetarchive/brozzler/issues/140
        self.logger.trace('%r', chrome_msg)
        if chrome_msg.get('params', {}).get('versions'):
            url = chrome_msg.get('params', {}).get('versions')[0]\
                    .get('scriptURL')
            if url and url not in sw_fetched:
                self.logger.info(
                        'fetching service worker script %s', url)
                self._fetch_url(site, url)
                sw_fetched.add(url)

    if not browser.is_running():
        browser.start(
                proxy=self._proxy_for(site),
                cookie_db=site.get('cookie_db'))
    final_page_url, outlinks = browser.browse_page(
            page.url, extra_headers=site.extra_headers(),
            behavior_parameters=site.get('behavior_parameters'),
            username=site.get('username'),
            password=site.get('password'),
            user_agent=site.get('user_agent'),
            on_screenshot=_on_screenshot,
            on_response=_on_response,
            on_request=on_request,
            on_service_worker_version_updated=_on_service_worker_version_updated,
            hashtags=page.hashtags,
            skip_extract_outlinks=self._skip_extract_outlinks,
            skip_visit_hashtags=self._skip_visit_hashtags,
            skip_youtube_dl=self._skip_youtube_dl,
            simpler404=self._simpler404,
            screenshot_full_page=self._screenshot_full_page,
            page_timeout=self._page_timeout,
            behavior_timeout=self._behavior_timeout,
            download_throughput=self._download_throughput)
    if final_page_url != page.url:
        page.note_redirect(final_page_url)
    return outlinks
def brozzle_site(self, browser, site):
    """Brozzle pages from `site` in `browser` for up to
    SITE_SESSION_MINUTES, honoring time limits, stop requests and
    robots.txt, then disclaim the site and record active brozzling time.
    """
    # initialized before the try so the `finally` block can't hit a
    # NameError if site.save() (or anything before the assignments
    # inside the try) raises
    start = None
    page = None
    try:
        site.last_claimed_by = '%s:%s' % (
                socket.gethostname(), browser.chrome.port)
        site.save()
        start = time.time()
        self._frontier.enforce_time_limit(site)
        self._frontier.honor_stop_request(site)
        # _proxy_for() call in log statement can raise brozzler.ProxyError
        # which is why we honor time limit and stop request first☝🏻
        self.logger.info(
                "brozzling site (proxy=%r) %s",
                self._proxy_for(site), site)
        while time.time() - start < self.SITE_SESSION_MINUTES * 60:
            site.refresh()
            self._frontier.enforce_time_limit(site)
            self._frontier.honor_stop_request(site)
            page = self._frontier.claim_page(site, "%s:%s" % (
                socket.gethostname(), browser.chrome.port))
            if (page.needs_robots_check and
                    not brozzler.is_permitted_by_robots(
                        site, page.url, self._proxy_for(site))):
                # logging.warn is deprecated; use logging.warning
                logging.warning(
                        "page %s is blocked by robots.txt", page.url)
                page.blocked_by_robots = True
                self._frontier.completed_page(site, page)
            else:
                outlinks = self.brozzle_page(
                        browser, site, page,
                        enable_youtube_dl=not self._skip_youtube_dl)
                self._frontier.completed_page(site, page)
                self._frontier.scope_and_schedule_outlinks(
                        site, page, outlinks)
                if browser.is_running():
                    site.cookie_db = \
                            browser.chrome.persist_and_read_cookie_db()
            # page fully handled; don't re-disclaim it in `finally`
            page = None
    except brozzler.ShutdownRequested:
        self.logger.info("shutdown requested")
    except brozzler.NothingToClaim:
        self.logger.info("no pages left for site %s", site)
    except brozzler.ReachedLimit as e:
        self._frontier.reached_limit(site, e)
    except brozzler.ReachedTimeLimit as e:
        self._frontier.finished(site, "FINISHED_TIME_LIMIT")
    except brozzler.CrawlStopped:
        self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
    # except brozzler.browser.BrowsingAborted:
    #     self.logger.info("{} shut down".format(browser))
    except brozzler.ProxyError as e:
        if self._warcprox_auto:
            logging.error(
                    'proxy error (site.proxy=%s), will try to choose a '
                    'healthy instance next time site is brozzled: %s',
                    site.proxy, e)
            site.proxy = None
        else:
            # using brozzler-worker --proxy, nothing to do but try the
            # same proxy again next time
            logging.error(
                    'proxy error (self._proxy=%r)', self._proxy,
                    exc_info=True)
    except Exception:
        # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # propagate instead of being logged and swallowed here
        self.logger.critical("unexpected exception", exc_info=True)
    finally:
        if start:
            site.active_brozzling_time = (
                    site.active_brozzling_time or 0) + time.time() - start
        self._frontier.disclaim_site(site, page)
def _browse_page(self, browser, site, page, on_screenshot=None,
                 on_request=None):
    """Browse `page` in `browser`, wiring up callbacks that submit
    screenshot/thumbnail jpegs to warcprox, record embedded video
    responses on the page, and fetch service worker scripts.

    Returns the outlinks reported by the browser; notes a redirect on
    the page when the final url differs from page.url.
    """
    def _on_screenshot(screenshot_png):
        if on_screenshot:
            on_screenshot(screenshot_png)
        if self._using_warcprox(site):
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to %s with "
                    "screenshot for %s", self._proxy_for(site), page)
            screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                    screenshot_png)
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="resource", content_type="image/jpeg",
                    payload=screenshot_jpeg,
                    extra_headers=site.extra_headers())
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="resource", content_type="image/jpeg",
                    payload=thumbnail_jpeg,
                    extra_headers=site.extra_headers())

    def _on_response(chrome_msg):
        # record successful (200/206) video/* responses on the page
        if ('params' in chrome_msg
                and 'response' in chrome_msg['params']
                and 'mimeType' in chrome_msg['params']['response']
                and chrome_msg['params']['response'].get(
                    'mimeType', '').startswith('video/')
                # skip manifests of DASH segmented video -
                # see https://github.com/internetarchive/brozzler/pull/70
                and chrome_msg['params']['response']['mimeType']
                    != 'video/vnd.mpeg.dash.mpd'
                and chrome_msg['params']['response'].get('status')
                    in (200, 206)):
            video = {
                'blame': 'browser',
                'url': chrome_msg['params']['response'].get('url'),
                'response_code':
                    chrome_msg['params']['response']['status'],
                'content-type':
                    chrome_msg['params']['response']['mimeType'],
            }
            response_headers = CaseInsensitiveDict(
                    chrome_msg['params']['response']['headers'])
            if 'content-length' in response_headers:
                video['content-length'] = int(
                        response_headers['content-length'])
            if 'content-range' in response_headers:
                video['content-range'] = \
                        response_headers['content-range']
            logging.debug('embedded video %s', video)
            # idiom fix: was `if not 'videos' in page:`
            if 'videos' not in page:
                page.videos = []
            page.videos.append(video)

    sw_fetched = set()
    def _on_service_worker_version_updated(chrome_msg):
        # https://github.com/internetarchive/brozzler/issues/140
        self.logger.trace('%r', chrome_msg)
        if chrome_msg.get('params', {}).get('versions'):
            url = chrome_msg.get('params', {}).get('versions')[0]\
                    .get('scriptURL')
            if url and url not in sw_fetched:
                self.logger.info(
                        'fetching service worker script %s', url)
                self._fetch_url(site, url)
                sw_fetched.add(url)

    if not browser.is_running():
        browser.start(
                proxy=self._proxy_for(site),
                cookie_db=site.get('cookie_db'))
    final_page_url, outlinks = browser.browse_page(
            page.url, extra_headers=site.extra_headers(),
            behavior_parameters=site.get('behavior_parameters'),
            username=site.get('username'),
            password=site.get('password'),
            user_agent=site.get('user_agent'),
            on_screenshot=_on_screenshot,
            on_response=_on_response,
            on_request=on_request,
            on_service_worker_version_updated=_on_service_worker_version_updated,
            hashtags=page.hashtags,
            skip_extract_outlinks=self._skip_extract_outlinks,
            skip_visit_hashtags=self._skip_visit_hashtags,
            skip_youtube_dl=self._skip_youtube_dl,
            page_timeout=self._page_timeout,
            behavior_timeout=self._behavior_timeout)
    if final_page_url != page.url:
        page.note_redirect(final_page_url)
    return outlinks
def brozzle_page(self, browser, site, page, on_screenshot=None):
    """Brozzle a single page: first attempt capture with youtube-dl,
    then browse the page in `browser` if it still needs browsing,
    otherwise fall back to a plain fetch.

    Returns the outlinks discovered by the browser, or [] when the page
    was not browsed.
    """
    def _on_screenshot(screenshot_png):
        # delegate to the caller's callback when supplied; otherwise
        # submit full screenshot + thumbnail to warcprox as resources
        if on_screenshot:
            on_screenshot(screenshot_png)
        elif self._proxy(site) and self._enable_warcprox_features(site):
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request "
                    "to warcprox with screenshot for %s", page)
            screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                    screenshot_png)
            self._warcprox_write_record(
                    warcprox_address=self._proxy(site),
                    url="screenshot:{}".format(page.url),
                    warc_type="resource", content_type="image/jpeg",
                    payload=screenshot_jpeg,
                    extra_headers=site.extra_headers())
            self._warcprox_write_record(
                    warcprox_address=self._proxy(site),
                    url="thumbnail:{}".format(page.url),
                    warc_type="resource", content_type="image/jpeg",
                    payload=thumbnail_jpeg,
                    extra_headers=site.extra_headers())

    self.logger.info("brozzling {}".format(page))
    # initialized here so the uses below don't raise NameError when
    # self._youtube_dl() fails before ydl.brozzler_spy can be read
    ydl_spy = None
    try:
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = self._youtube_dl(tempdir, site)
            ydl_spy = ydl.brozzler_spy  # remember for later
            self._try_youtube_dl(ydl, site, page)
    except brozzler.ReachedLimit as e:
        raise
    except Exception as e:
        # youtube-dl wraps http errors; 430 presumably a warcprox
        # "reached limit" status — TODO confirm
        if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                and hasattr(e.exc_info[1], 'code')
                and e.exc_info[1].code == 430):
            self.logger.info(
                    'youtube-dl got %s %s processing %s',
                    e.exc_info[1].code, e.exc_info[1].msg, page.url)
        else:
            self.logger.error(
                    "youtube_dl raised exception on %s", page,
                    exc_info=True)

    if self._needs_browsing(page, ydl_spy):
        self.logger.info('needs browsing: %s', page)
        if not browser.is_running():
            browser.start(
                    proxy=self._proxy(site), cookie_db=site.cookie_db)
        outlinks = browser.browse_page(
                page.url, extra_headers=site.extra_headers(),
                on_screenshot=_on_screenshot,
                on_url_change=page.note_redirect)
        return outlinks
    else:
        if not self._already_fetched(page, ydl_spy):
            self.logger.info('needs fetch: %s', page)
            self._fetch_url(site, page)
        else:
            self.logger.info('already fetched: %s', page)
        return []
def _browse_page(self, browser, site, page, on_screenshot=None):
    """Browse `page` in `browser`, wiring up callbacks that submit
    screenshot/thumbnail jpegs to warcprox and record embedded video
    responses on the page.

    Returns the outlinks reported by the browser; notes a redirect on
    the page when the final url differs from page.url.
    """
    def _on_screenshot(screenshot_png):
        if on_screenshot:
            on_screenshot(screenshot_png)
        if self._using_warcprox(site):
            self.logger.info(
                    "sending WARCPROX_WRITE_RECORD request to %s with "
                    "screenshot for %s", self._proxy_for(site), page)
            screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                    screenshot_png)
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="resource", content_type="image/jpeg",
                    payload=screenshot_jpeg,
                    extra_headers=site.extra_headers())
            self._warcprox_write_record(
                    warcprox_address=self._proxy_for(site),
                    url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                    warc_type="resource", content_type="image/jpeg",
                    payload=thumbnail_jpeg,
                    extra_headers=site.extra_headers())

    def _on_response(chrome_msg):
        # record successful (200/206) video/* responses on the page
        if ('params' in chrome_msg
                and 'response' in chrome_msg['params']
                and 'mimeType' in chrome_msg['params']['response']
                and chrome_msg['params']['response'].get(
                    'mimeType', '').startswith('video/')
                and chrome_msg['params']['response'].get('status')
                    in (200, 206)):
            video = {
                'blame': 'browser',
                'url': chrome_msg['params']['response'].get('url'),
                'response_code':
                    chrome_msg['params']['response']['status'],
                'content-type':
                    chrome_msg['params']['response']['mimeType'],
            }
            response_headers = CaseInsensitiveDict(
                    chrome_msg['params']['response']['headers'])
            if 'content-length' in response_headers:
                video['content-length'] = int(
                        response_headers['content-length'])
            if 'content-range' in response_headers:
                video['content-range'] = \
                        response_headers['content-range']
            logging.debug('embedded video %s', video)
            # idiom fix: was `if not 'videos' in page:`
            if 'videos' not in page:
                page.videos = []
            page.videos.append(video)

    if not browser.is_running():
        browser.start(
                proxy=self._proxy_for(site),
                cookie_db=site.get('cookie_db'))
    final_page_url, outlinks = browser.browse_page(
            page.url, extra_headers=site.extra_headers(),
            behavior_parameters=site.get('behavior_parameters'),
            username=site.get('username'),
            password=site.get('password'),
            user_agent=site.get('user_agent'),
            on_screenshot=_on_screenshot,
            on_response=_on_response,
            hashtags=page.hashtags,
            skip_extract_outlinks=self._skip_extract_outlinks,
            skip_visit_hashtags=self._skip_visit_hashtags)
    if final_page_url != page.url:
        page.note_redirect(final_page_url)
    return outlinks